From 1ef227f4824432f25db29bdfff22e8da94f960a1 Mon Sep 17 00:00:00 2001 From: byuu <2107894+byuu@users.noreply.github.com> Date: Wed, 25 Sep 2019 15:13:12 +0900 Subject: [PATCH] v110.2 Added CRT-Royale [hunterk] Improved libretro target [rtretiakov] --- bsnes/emulator/emulator.hpp | 2 +- bsnes/target-bsnes/GNUmakefile | 7 +- .../presentation/presentation.cpp | 2 +- bsnes/target-libretro/GNUmakefile | 14 +- bsnes/target-libretro/program.cpp | 5 +- bsnes/target-libretro/resources.hpp | 760 +- shaders/CRT-Royale.shader/bloom-approx.fs | 13973 +++++++++++++++ shaders/CRT-Royale.shader/bloom-approx.vs | 5859 +++++++ .../bloom-horizontal-reconstitute.fs | 7240 ++++++++ .../bloom-horizontal-reconstitute.vs | 6570 +++++++ shaders/CRT-Royale.shader/bloom-vertical.fs | 4824 +++++ shaders/CRT-Royale.shader/bloom-vertical.vs | 3792 ++++ .../CRT-Royale.shader/blur9fast-horizontal.fs | 2016 +++ .../CRT-Royale.shader/blur9fast-horizontal.vs | 2025 +++ .../CRT-Royale.shader/blur9fast-vertical.fs | 2016 +++ .../CRT-Royale.shader/blur9fast-vertical.vs | 2025 +++ shaders/CRT-Royale.shader/brightpass.fs | 14481 ++++++++++++++++ shaders/CRT-Royale.shader/brightpass.vs | 6551 +++++++ ...rst-pass-linearize-crt-gamma-bob-fields.fs | 4748 +++++ ...rst-pass-linearize-crt-gamma-bob-fields.vs | 4704 +++++ .../geometry-aa-last-pass.fs | 5279 ++++++ .../geometry-aa-last-pass.vs | 5263 ++++++ shaders/CRT-Royale.shader/manifest.bml | 214 + .../mask-resize-horizontal.fs | 3208 ++++ .../mask-resize-horizontal.vs | 3236 ++++ .../CRT-Royale.shader/mask-resize-vertical.fs | 3248 ++++ .../CRT-Royale.shader/mask-resize-vertical.vs | 3212 ++++ .../scanlines-horizontal-apply-mask.fs | 10845 ++++++++++++ .../scanlines-horizontal-apply-mask.vs | 6047 +++++++ .../scanlines-vertical-interlacing.fs | 5963 +++++++ .../scanlines-vertical-interlacing.vs | 5830 +++++++ ...nearApertureGrille15Wide8And5d5Spacing.png | Bin 0 -> 198848 bytes ...reGrille15Wide8And5d5SpacingResizeTo64.png | Bin 0 -> 4173 bytes 
.../textures/TileableLinearShadowMask.png | Bin 0 -> 218631 bytes .../textures/TileableLinearShadowMaskEDP.png | Bin 0 -> 206668 bytes .../TileableLinearShadowMaskEDPResizeTo64.png | Bin 0 -> 5373 bytes .../TileableLinearShadowMaskResizeTo64.png | Bin 0 -> 6008 bytes ...de9And4d5Horizontal9d14VerticalSpacing.png | Bin 0 -> 204254 bytes ...orizontal9d14VerticalSpacingResizeTo64.png | Bin 0 -> 6916 bytes 39 files changed, 133576 insertions(+), 383 deletions(-) create mode 100644 shaders/CRT-Royale.shader/bloom-approx.fs create mode 100644 shaders/CRT-Royale.shader/bloom-approx.vs create mode 100644 shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs create mode 100644 shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs create mode 100644 shaders/CRT-Royale.shader/bloom-vertical.fs create mode 100644 shaders/CRT-Royale.shader/bloom-vertical.vs create mode 100644 shaders/CRT-Royale.shader/blur9fast-horizontal.fs create mode 100644 shaders/CRT-Royale.shader/blur9fast-horizontal.vs create mode 100644 shaders/CRT-Royale.shader/blur9fast-vertical.fs create mode 100644 shaders/CRT-Royale.shader/blur9fast-vertical.vs create mode 100644 shaders/CRT-Royale.shader/brightpass.fs create mode 100644 shaders/CRT-Royale.shader/brightpass.vs create mode 100644 shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs create mode 100644 shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs create mode 100644 shaders/CRT-Royale.shader/geometry-aa-last-pass.fs create mode 100644 shaders/CRT-Royale.shader/geometry-aa-last-pass.vs create mode 100644 shaders/CRT-Royale.shader/manifest.bml create mode 100644 shaders/CRT-Royale.shader/mask-resize-horizontal.fs create mode 100644 shaders/CRT-Royale.shader/mask-resize-horizontal.vs create mode 100644 shaders/CRT-Royale.shader/mask-resize-vertical.fs create mode 100644 shaders/CRT-Royale.shader/mask-resize-vertical.vs create mode 100644 shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs 
create mode 100644 shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.vs create mode 100644 shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs create mode 100644 shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMask.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDP.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDPResizeTo64.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskResizeTo64.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png diff --git a/bsnes/emulator/emulator.hpp b/bsnes/emulator/emulator.hpp index b720b098..91e3be52 100644 --- a/bsnes/emulator/emulator.hpp +++ b/bsnes/emulator/emulator.hpp @@ -29,7 +29,7 @@ using namespace nall; namespace Emulator { static const string Name = "bsnes"; - static const string Version = "110.1"; + static const string Version = "110.2"; static const string Author = "byuu"; static const string License = "GPLv3"; static const string Website = "https://byuu.org"; diff --git a/bsnes/target-bsnes/GNUmakefile b/bsnes/target-bsnes/GNUmakefile index 79a3a705..7c7baa72 100644 --- a/bsnes/target-bsnes/GNUmakefile +++ b/bsnes/target-bsnes/GNUmakefile @@ -27,9 +27,11 @@ ifeq ($(platform),macos) mkdir -p out/$(name).app/Contents/MacOS/ mkdir -p out/$(name).app/Contents/MacOS/Database/ mkdir -p out/$(name).app/Contents/MacOS/Firmware/ + mkdir -p out/$(name).app/Contents/MacOS/Shaders/ mkdir -p 
out/$(name).app/Contents/Resources/ mv out/$(name) out/$(name).app/Contents/MacOS/$(name) cp Database/* out/$(name).app/Contents/MacOS/Database/ + cp -r ../shaders/* out/$(name).app/Contents/macOS/Shaders/ cp $(ui)/resource/$(name).plist out/$(name).app/Contents/Info.plist sips -s format icns $(ui)/resource/$(name).png --out out/$(name).app/Contents/Resources/$(name).icns endif @@ -44,6 +46,7 @@ else ifeq ($(platform),macos) mkdir -p ~/Library/Application\ Support/$(name)/ mkdir -p ~/Library/Application\ Support/$(name)/Database/ mkdir -p ~/Library/Application\ Support/$(name)/Firmware/ + mkdir -p ~/Library/Application\ Support/$(name)/Shaders/ cp -R out/$(name).app /Applications/$(name).app else ifneq ($(filter $(platform),linux bsd),) mkdir -p $(prefix)/bin/ @@ -52,12 +55,12 @@ else ifneq ($(filter $(platform),linux bsd),) mkdir -p $(prefix)/share/$(name)/ mkdir -p $(prefix)/share/$(name)/Database/ mkdir -p $(prefix)/share/$(name)/Firmware/ - mkdir -p $(prefix)/share/$(name)/Locale/ + mkdir -p $(prefix)/share/$(name)/Shaders/ cp out/$(name) $(prefix)/bin/$(name) cp $(ui)/resource/$(name).desktop $(prefix)/share/applications/$(name).desktop cp $(ui)/resource/$(name).png $(prefix)/share/icons/$(name).png cp Database/* $(prefix)/share/$(name)/Database/ - cp Locale/* $(prefix)/share/$(name)/Locale/ + cp -r ../shaders/* $(prefix)/share/$(name)/Shaders/ endif uninstall: diff --git a/bsnes/target-bsnes/presentation/presentation.cpp b/bsnes/target-bsnes/presentation/presentation.cpp index 94b8e204..700454e4 100644 --- a/bsnes/target-bsnes/presentation/presentation.cpp +++ b/bsnes/target-bsnes/presentation/presentation.cpp @@ -530,7 +530,7 @@ auto Presentation::updateShaders() -> void { }); shaders.append(blur); - auto location = locate("shaders/"); + auto location = locate("Shaders/"); if(settings.video.driver == "OpenGL 3.2") { for(auto shader : directory::folders(location, "*.shader")) { diff --git a/bsnes/target-libretro/GNUmakefile b/bsnes/target-libretro/GNUmakefile 
index 4c4cf7fb..104128e3 100644 --- a/bsnes/target-libretro/GNUmakefile +++ b/bsnes/target-libretro/GNUmakefile @@ -1,5 +1,7 @@ name := libretro.so -flags += -Wno-narrowing -Wno-multichar -fopenmp -g -fPIC +local := false +openmp := true +flags += -Wno-narrowing -Wno-multichar -g -fPIC objects := libretro $(objects) objects := $(patsubst %,obj/%.o,$(objects)) @@ -13,4 +15,14 @@ else ifeq ($(platform),windows) $(strip $(compiler) -o out/bsnes_libretro.dll -shared $(objects) -Wl,--no-undefined -Wl,--version-script=target-libretro/link.T -static-libgcc -static-libstdc++ -Wl,-Bstatic -lws2_32 -lpthread -lgomp -Wl,-Bdynamic) else ifeq ($(platform),macos) $(strip $(compiler) -o out/bsnes_libretro.dylib -shared $(objects) -lpthread -ldl) +else ifeq ($(platform), ios-arm64) + ifeq ($(IOSSDK),) + IOSSDK := $(shell xcodebuild -version -sdk iphoneos Path) + endif + $(strip c++ -arch arm64 -marm -miphoneos-version-min=11.0 -isysroot $(IOSSDK) -o out/bsnes_libretro_ios.dylib -shared $(objects) -lpthread -ldl) +else ifeq ($(platform), tvos-arm64) + ifeq ($(IOSSDK),) + IOSSDK := $(shell xcodebuild -version -sdk appletvos Path) + endif + $(strip c++ -arch arm64 -marm -mtvos-version-min=11.0 -isysroot $(IOSSDK) -o out/bsnes_libretro_tvos.dylib -shared $(objects) -lpthread -ldl) endif diff --git a/bsnes/target-libretro/program.cpp b/bsnes/target-libretro/program.cpp index c07365c7..cbfa4c37 100644 --- a/bsnes/target-libretro/program.cpp +++ b/bsnes/target-libretro/program.cpp @@ -146,6 +146,9 @@ auto Program::load() -> void { //fixes an errant scanline on the title screen due to writing to PPU registers too late if(title == "ADVENTURES OF FRANKEN" && region == "PAL") emulator->configure("Hacks/PPU/RenderCycle", 32); + //fixes an errant scanline on the title screen due to writing to PPU registers too late + if(title == "FIREPOWER 2000") emulator->configure("Hacks/PPU/RenderCycle", 32); + emulator->power(); } @@ -288,7 +291,7 @@ auto Program::openRomSuperFamicom(string name, 
vfs::file::mode mode) -> shared_p string save_path; auto suffix = Location::suffix(base_name); - auto base = Location::base(base_name); + auto base = Location::base(base_name.transform("\\", "/")); const char *save = nullptr; if (environ_cb && environ_cb(RETRO_ENVIRONMENT_GET_SAVE_DIRECTORY, &save) && save) diff --git a/bsnes/target-libretro/resources.hpp b/bsnes/target-libretro/resources.hpp index b0ba3fa6..1a438626 100644 --- a/bsnes/target-libretro/resources.hpp +++ b/bsnes/target-libretro/resources.hpp @@ -1,4 +1,4 @@ -const unsigned char boardsbml[30846] = { +const unsigned char boardsbml[31025] = { 100,97,116,97,98,97,115,101,10,32,32,114,101,118,105,115,105,111,110,58,32,50,48,49,56,45,48,55,45,50,53,10, 10,47,47,66,111,97,114,100,115,32,40,80,114,111,100,117,99,116,105,111,110,41,10,10,100,97,116,97,98,97,115,101, 10,32,32,114,101,118,105,115,105,111,110,58,32,50,48,49,56,45,48,53,45,49,54,10,10,98,111,97,114,100,58,32, @@ -571,398 +571,404 @@ const unsigned char boardsbml[30846] = { 109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32, 32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102, 102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58, - 48,48,48,48,45,102,102,102,102,10,10,47,47,66,111,97,114,100,115,32,40,71,101,110,101,114,105,99,41,10,10,100, - 97,116,97,98,97,115,101,10,32,32,114,101,118,105,115,105,111,110,58,32,50,48,49,56,45,48,55,45,50,53,10,10, - 98,111,97,114,100,58,32,65,82,77,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121, - 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97, - 100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115, - 
107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,54,102,44,99, - 48,45,101,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109, - 111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97, - 112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10, - 32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32, - 32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,56,48,48,45,51, - 56,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61, - 80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,101, - 109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105, - 116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77, + 48,48,48,48,45,102,102,102,102,10,10,47,47,66,111,97,114,100,115,32,40,80,114,111,116,111,116,121,112,101,115,41, + 10,10,98,111,97,114,100,58,32,83,72,86,67,45,52,80,86,53,66,45,48,49,10,32,32,109,101,109,111,114,121,32, + 116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112, + 32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109, + 97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100, + 44,99,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,47,47, + 66,111,97,114,100,115,32,40,71,101,110,101,114,105,99,41,10,10,100,97,116,97,98,97,115,101,10,32,32,114,101,118, + 
105,115,105,111,110,58,32,50,48,49,56,45,48,55,45,50,53,10,10,98,111,97,114,100,58,32,65,82,77,45,76,79, + 82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, + 116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44, + 56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32, + 109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,54,102,44,99,48,45,101,102,58,48,48,48,48,45,55,102,102, + 102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32, + 99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45, + 55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97, + 114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, + 61,48,48,45,51,102,44,56,48,45,98,102,58,51,56,48,48,45,51,56,102,102,10,32,32,32,32,109,101,109,111,114, + 121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105, + 116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77, 32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10, - 32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,66,83,45,72,73,82,79,77,45, - 82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114, - 111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57, - 102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,53, - 
102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61, - 82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48, - 48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,109,97,112,32, - 97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32, - 32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,55,100,44,101,48,45,102,102,58,48,48,48,48,45,102, - 102,102,102,10,10,98,111,97,114,100,58,32,66,83,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114, - 121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61, - 48,120,48,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100, - 114,101,115,115,61,50,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,49,48,48,48, - 48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56, - 48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,50,48,48,48,48,48,32,109,97,115, - 107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,97,48,45,98,102,58,56, - 48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,49,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48, - 48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118, - 101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48, - 
48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,61, - 66,83,77,101,109,111,114,121,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,101,102,58,48, - 48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,66,83,45,77,67,67,45,82,65,77,10,32,32,109,101, + 32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97, + 32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,111,115,99,105,108,108,97,116,111, + 114,10,10,98,111,97,114,100,58,32,66,83,45,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32, + 116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112, + 32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,10,32, + 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,102,44,99,48,45,100,102,58,48,48,48,48,45, + 102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83, + 97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58, + 54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,115,108,111,116,32,116,121,112, + 101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102, + 44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, + 61,54,48,45,55,100,44,101,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,66, + 83,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111, + 110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48, + 
45,49,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,48,48,48,48,48,48,32,109,97,115,107, + 61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,58,56,48, + 48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,49,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48, + 48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,57,102,58,56,48,48,48,45,102,102,102, + 102,32,98,97,115,101,61,48,120,50,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32, + 109,97,112,32,97,100,100,114,101,115,115,61,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101, + 61,48,120,49,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116, + 121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100, + 114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61, + 48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,32,32,32,32, + 109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,101,102,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97, + 114,100,58,32,66,83,45,77,67,67,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77, + 32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,49,48, + 45,49,55,58,53,48,48,48,45,53,102,102,102,32,109,97,115,107,61,48,120,102,48,48,48,10,32,32,112,114,111,99, + 101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,77,67,67,10,32,32,32,32,109,97,112,32,97,100,100, + 114,101,115,115,61,48,48,45,48,102,58,53,48,48,48,45,53,102,102,102,10,32,32,32,32,109,99,117,10,32,32,32, + 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45, + 
102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45, + 102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50, + 48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114, + 121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32, + 32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,111,119,110,108,111,97, + 100,10,32,32,32,32,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,10,98,111,97,114, + 100,58,32,66,83,45,83,65,49,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116, + 101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61, + 48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,45,50,51,102,102,10,32,32,32,32,109,99,117,10,32,32, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48, + 45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111, + 114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32, + 32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,109,101,109,111,114,121,32, + 116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32, + 97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105, + 122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,52, + 
102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32, + 99,111,110,116,101,110,116,61,73,110,116,101,114,110,97,108,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101, + 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48,45,51,55,102,102,32,115,105,122,101,61,48,120, + 56,48,48,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,67,67,57,50,10,32,32,109,101,109,111,114,121,32, + 116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107, + 61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114, + 61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,56,50,49,52,10,32,32,32,32,105, + 100,101,110,116,105,102,105,101,114,58,32,67,97,109,112,117,115,32,67,104,97,108,108,101,110,103,101,32,39,57,50,10, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,44,101,48,58,48,48,48,48,10,32,32,32,32,109, + 99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102, + 58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77, + 32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121, + 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,49,10,32,32,32,32,32,32,109,101,109, + 111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,50,10,32,32,32, + 32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108, + 45,51,10,32,32,32,32,100,105,112,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117, + 
114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32, + 32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102, + 102,102,32,109,97,115,107,61,48,120,55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82, + 79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61, + 117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116, + 101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32, + 32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97, + 114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116, + 111,114,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,80,70,57,52,10,32,32,109,101,109,111,114,121,32,116, + 121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100, + 114,101,115,115,61,51,48,45,51,102,44,98,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61, + 48,120,101,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,61, + 78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,56,50,49,52,10,32,32,32,32,105,100, + 101,110,116,105,102,105,101,114,58,32,80,111,119,101,114,70,101,115,116,32,39,57,52,10,32,32,32,32,109,97,112,32, + 97,100,100,114,101,115,115,61,49,48,44,50,48,58,54,48,48,48,10,32,32,32,32,109,99,117,10,32,32,32,32,32, + 32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102, + 102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45, + 
102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101, + 110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77, + 32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,49,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121, + 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,50,10,32,32,32,32,32,32,109,101,109, + 111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,51,10,32,32,32, + 32,100,105,112,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,61,78,69, + 67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97, + 100,100,114,101,115,115,61,48,48,45,48,102,44,56,48,45,56,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115, + 107,61,48,120,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116, + 101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53, + 10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116, + 97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114, + 121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99, + 116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97, + 114,100,58,32,69,88,72,73,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111, + 110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48, + 45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32, + 
109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,97,115,101, + 61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,98,102,58, + 56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,97,112,32, + 97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99, + 48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111, + 114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32, + 109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101, + 61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58, + 48,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32, + 97,100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99, + 48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48, + 45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112, + 101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101, + 115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120, + 101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,58,48,48,48,48,45, + 55,102,102,102,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,77,45,82,65,77,45,83,72,65,82,80,82,84, + 67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103, + 
114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,48,48,48,45,102, + 102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115, + 115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32, + 109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48, + 45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,109,101, 109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,49,48,45,49,55,58,53,48,48,48,45,53,102,102,102,32,109,97,115,107,61, - 48,120,102,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,77,67, - 67,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,48,102,58,53,48,48,48,45,53,102,102, - 102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51, - 102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114, - 101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32, - 109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102, - 102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61, - 80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111, - 110,116,101,110,116,61,68,111,119,110,108,111,97,100,10,32,32,32,32,32,32,115,108,111,116,32,116,121,112,101,61,66, - 
83,77,101,109,111,114,121,10,10,98,111,97,114,100,58,32,66,83,45,83,65,49,45,82,65,77,10,32,32,112,114,111, - 99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,32,32,32, - 32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,45,50,51, - 102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45, - 51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,48,48,48, - 10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102, - 102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116, - 61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114, - 121,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97, - 118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102, - 58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112, - 32,97,100,100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109, - 111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,73,110,116,101,114,110,97,108,10,32,32, - 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48, - 45,51,55,102,102,32,115,105,122,101,61,48,120,56,48,48,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,67, - 67,57,50,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97, - 118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48, - 
48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114, - 32,109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117, - 80,68,55,56,50,49,52,10,32,32,32,32,105,100,101,110,116,105,102,105,101,114,58,32,67,97,109,112,117,115,32,67, - 104,97,108,108,101,110,103,101,32,39,57,50,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,44, - 101,48,58,48,48,48,48,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115, - 115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101, + 97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102, + 32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45, + 55,100,58,48,48,48,48,45,55,102,102,102,10,32,32,114,116,99,32,109,97,110,117,102,97,99,116,117,114,101,114,61, + 83,104,97,114,112,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98, + 102,58,50,56,48,48,45,50,56,48,49,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,84,67,32, + 99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,99,116,117,114,101,114,61,83,104,97,114,112,10, + 10,98,111,97,114,100,58,32,69,88,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79, + 77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115, + 115,61,48,48,45,55,100,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32, + 98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48, + 45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101, + 
61,48,120,48,48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,88,76,79,82,79,77,45,82,65,77,10,32,32, + 109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,58,56,48,48,48,45,102,102,102,102,32, + 109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32, + 109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107, + 61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,48,48,48,48,48,48,10,32,32,109,101,109,111,114,121, + 32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97, + 100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115, + 107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,69,88,78,69,67,45,76,79,82,79,77,10,32,32,109, + 101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32, + 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45, + 102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99, + 104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101, + 115,115,61,54,48,45,54,55,44,101,48,45,101,55,58,48,48,48,48,45,51,102,102,102,10,32,32,32,32,109,101,109, + 111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99, + 104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121, + 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101, + 
61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111, + 110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48, + 10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,56,45,54,102,44,101,56,45,101,102,58,48, + 48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,111,115,99,105,108,108,97, + 116,111,114,10,10,98,111,97,114,100,58,32,69,88,83,80,67,55,49,49,48,45,82,65,77,45,69,80,83,79,78,82, + 84,67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,69,120,112, + 97,110,115,105,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,52,102,58,48,48,48, + 48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,80, + 67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98, + 102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,53,48,44,53, + 56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107, + 61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102, + 102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,32,32, + 109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10, + 32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97, + 116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83, + 
97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98, + 102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,114,116,99,32,109,97, + 110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, + 61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,52,48,45,52,56,52,50,10,32,32,32,32,109,101,109,111,114, + 121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,99,116, + 117,114,101,114,61,69,112,115,111,110,10,10,98,111,97,114,100,58,32,71,66,45,76,79,82,79,77,10,32,32,109,101, 109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32, - 32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101, - 108,45,49,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, - 116,61,76,101,118,101,108,45,50,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32, - 99,111,110,116,101,110,116,61,76,101,118,101,108,45,51,10,32,32,32,32,100,105,112,10,32,32,112,114,111,99,101,115, - 115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114, - 101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44, - 97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,55,102,102,102,10,32,32,32,32, - 109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32, - 97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32, - 116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117, - 
114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99, - 111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53, - 10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,80,70, - 57,52,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118, - 101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,51,48,45,51,102,44,98,48,45,98,102,58,54,48, - 48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32, - 109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80, - 68,55,56,50,49,52,10,32,32,32,32,105,100,101,110,116,105,102,105,101,114,58,32,80,111,119,101,114,70,101,115,116, - 32,39,57,52,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,49,48,44,50,48,58,54,48,48,48,10, - 32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44, - 56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115, - 115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116, - 121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101, - 109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,49,10,32,32, - 32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101, - 108,45,50,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, - 116,61,76,101,118,101,108,45,51,10,32,32,32,32,100,105,112,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97, - 
110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55, - 55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,48,102,44,56,48,45,56,102,58, + 32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102, + 102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61, + 52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48, + 48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,73,67,68,32,114,101,118, + 105,115,105,111,110,61,50,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48, + 45,98,102,58,54,48,48,48,45,54,55,102,102,44,55,48,48,48,45,55,102,102,102,10,32,32,32,32,109,101,109,111, + 114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,66,111,111,116,32,97,114,99,104,105,116,101, + 99,116,117,114,101,61,76,82,51,53,57,48,50,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,32,32,32, + 32,115,108,111,116,32,116,121,112,101,61,71,97,109,101,66,111,121,10,10,98,111,97,114,100,58,32,71,83,85,45,82, + 65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,71,83,85,10, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48, + 45,51,52,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, + 116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51, + 102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,102,44,99,48,45,100,102,58,48,48,48,48, + 
45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110, + 116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56, + 48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,10,32,32,32,32,32, + 32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,49,44,102,48,45,102,49,58,48,48,48,48,45,102,102, + 102,102,10,10,98,111,97,114,100,58,32,72,73,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82, + 79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101, + 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112, + 32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10, + 98,111,97,114,100,58,32,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82, + 79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101, + 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112, + 32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32, + 32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32, + 32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55, + 102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,10,98,111,97,114,100,58,32,72,73,84,65,67,72,73,45, + 76,79,82,79,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,72, + 71,53,49,66,83,49,54,57,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56, + 
48,45,98,102,58,54,99,48,48,45,54,102,102,102,44,55,99,48,48,45,55,102,102,102,10,32,32,32,32,109,101,109, + 111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32, + 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45, + 102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101, + 61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114, + 101,115,115,61,55,48,45,55,55,58,48,48,48,48,45,55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116, + 121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114, + 101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32, + 99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,72,71,53,49,66,83, + 49,54,57,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98, + 102,58,54,48,48,48,45,54,98,102,102,44,55,48,48,48,45,55,98,102,102,32,109,97,115,107,61,48,120,102,48,48, + 48,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,76,79,82,79,77,10,32, + 32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109, + 10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48, + 48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,76,79,82,79,77, + 45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80, + 114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45, + 
102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114, + 121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32, + 97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97, + 115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,76,79,82,79,77,45,82,65,77,35,65,10,32,32, + 109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48, + 45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61, + 82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, + 61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48, + 48,48,10,10,98,111,97,114,100,58,32,78,69,67,45,72,73,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121, + 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97, + 100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32, + 32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102, + 102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55, + 55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58, 54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32, 116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101, 
99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79, 77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55, 55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61, 68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115, - 99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,77,10,32,32,109,101,109,111,114, - 121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61, - 48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,48, - 48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97, - 100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48, - 48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45, - 102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,88,72,73,82, - 79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116, - 61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56, - 48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97, - 100,100,114,101,115,115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48, - 48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45, - 
102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101, - 115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48, - 10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10, - 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48, - 45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115, - 115,61,55,48,45,55,100,58,48,48,48,48,45,55,102,102,102,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79, - 77,45,82,65,77,45,83,72,65,82,80,82,84,67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77, - 32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,48,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32, - 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98, - 97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45, - 98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61, - 48,120,99,48,48,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101, - 110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48, - 45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97, - 112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,58,48,48,48,48,45,55,102,102,102,10,32,32,114,116,99,32, - 
109,97,110,117,102,97,99,116,117,114,101,114,61,83,104,97,114,112,10,32,32,32,32,109,97,112,32,97,100,100,114,101, - 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,56,48,48,45,50,56,48,49,10,32,32,32,32,109,101,109, - 111,114,121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97, - 99,116,117,114,101,114,61,83,104,97,114,112,10,10,98,111,97,114,100,58,32,69,88,76,79,82,79,77,10,32,32,109, - 101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32, - 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,58,56,48,48,48,45,102,102,102,102,32,109, - 97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61, - 48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,48,48,48,48,48,48,10,10,98,111,97,114,100,58,32,69, - 88,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110, - 116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45, - 55,100,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61, - 48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,102,102,58,56, - 48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,48,48, - 48,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83, - 97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58, - 48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,69,88, - 
78,69,67,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116, - 101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55, - 100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32, - 112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10, - 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,54,55,44,101,48,45,101,55,58,48,48,48,48, - 45,51,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, - 116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10, - 32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97, - 32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,101,109,111,114, - 121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99, - 116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61, - 54,56,45,54,102,44,101,56,45,101,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48, - 48,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,69,88,83,80,67,55,49, - 49,48,45,82,65,77,45,69,80,83,79,78,82,84,67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79, - 77,32,99,111,110,116,101,110,116,61,69,120,112,97,110,115,105,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114, - 101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32, - 105,100,101,110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101, - 
115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112, - 32,97,100,100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117, - 10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56, - 48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112, - 32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120, - 99,48,48,48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110, - 116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82, - 79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61, - 82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101, - 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120, - 101,48,48,48,10,32,32,114,116,99,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,32,32, - 32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,52,48,45,52, - 56,52,50,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61, - 84,105,109,101,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,10,98,111,97,114,100,58,32, - 71,66,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101, - 110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100, - 44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32, - 
32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,55,102, - 102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116, - 105,102,105,101,114,61,73,67,68,32,114,101,118,105,115,105,111,110,61,50,10,32,32,32,32,109,97,112,32,97,100,100, - 114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,54,55,102,102,44,55,48,48,48,45, - 55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116, - 61,66,111,111,116,32,97,114,99,104,105,116,101,99,116,117,114,101,61,76,82,51,53,57,48,50,10,32,32,32,32,111, - 115,99,105,108,108,97,116,111,114,10,32,32,32,32,115,108,111,116,32,116,121,112,101,61,71,97,109,101,66,111,121,10, - 10,98,111,97,114,100,58,32,71,83,85,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104, - 105,116,101,99,116,117,114,101,61,71,83,85,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45, - 51,102,44,56,48,45,98,102,58,51,48,48,48,45,51,52,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121, - 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,97,112, - 32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109, - 97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45, - 53,102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121, - 112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100, - 100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,122,101, - 61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,49,44, - 
102,48,45,102,49,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,72,73,82,79,77,10,32,32, - 109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10, - 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48, - 45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102, - 102,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,72,73,82,79,77,45,82,65,77,10,32,32, - 109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10, - 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48, - 45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102, - 102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111, - 110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102, - 44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,10,98,111, - 97,114,100,58,32,72,73,84,65,67,72,73,45,76,79,82,79,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97, - 114,99,104,105,116,101,99,116,117,114,101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,109,97,112,32,97,100, - 100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,99,48,48,45,54,102,102,102,44,55,99,48,48, - 45,55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, - 116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51, - 102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32, - 
32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32, - 32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,55,58,48,48,48,48,45,55,102,102,102, - 10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116, - 97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,109,101,109, - 111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116, - 101,99,116,117,114,101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101, - 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,54,98,102,102,44,55,48,48,48,45,55,98, - 102,102,32,109,97,115,107,61,48,120,102,48,48,48,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98, - 111,97,114,100,58,32,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111, - 110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48, - 45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10, - 10,98,111,97,114,100,58,32,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61, - 82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114, - 101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48, - 120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61, - 83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102, - 58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,76, - 
79,82,79,77,45,82,65,77,35,65,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110, - 116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45, - 51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32, - 32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32, - 32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102, - 102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,78,69,67,45,72,73,82,79, - 77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103, - 114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58, - 56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44, - 99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104, - 105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,48,48,45,49,102,44,56,48,45,57,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,102,102, - 102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114, - 111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109, - 101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104, - 105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101, - 61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117, - 
80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69, - 67,45,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111, - 110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48, - 45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114, - 101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111, - 114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112, - 32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109, - 97,115,107,61,48,120,101,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116, - 117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49, - 102,44,56,48,45,57,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,102,102,102,10,32,32,32, + 99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,72,73,82,79,77,45,82,65,77,10,32, + 32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109, + 10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48, + 48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45, + 102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99, + 111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51, + 102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32, + 
112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32, + 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,54,48,48,48,45, + 55,102,102,102,32,109,97,115,107,61,48,120,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61, + 82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101, + 61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110, + 116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32, + 32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32, + 97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97, + 116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116, + 121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32, + 97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,109,97, + 115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117, + 114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,51,48,45,51,102, + 44,98,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,51,102,102,102,10,32,32,32, 32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109, 32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121, 32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116, 
117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32, 99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50, 53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,76,79,82, - 79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111, - 103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102, - 58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115, - 111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32, - 97,100,100,114,101,115,115,61,51,48,45,51,102,44,98,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97, - 115,107,61,48,120,51,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111, - 110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55, - 50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68, - 97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109, - 111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116, - 101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98, - 111,97,114,100,58,32,78,69,67,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112, - 101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100, - 100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107, - 
61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110, - 116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45, - 102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101, - 115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97, - 112,32,97,100,100,114,101,115,115,61,54,48,45,54,102,44,101,48,45,101,102,58,48,48,48,48,45,55,102,102,102,32, - 109,97,115,107,61,48,120,51,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32, - 99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68, - 55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116, - 61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109, - 101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104, - 105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10, - 10,98,111,97,114,100,58,32,78,69,67,45,76,79,82,79,77,45,82,65,77,35,65,10,32,32,109,101,109,111,114,121, - 32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97, - 112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32, - 109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111, - 110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100, - 44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99, - 
104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115, - 115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,51, - 102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61, - 80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32, - 32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114, - 99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121, - 112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101, - 61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32, - 79,66,67,49,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77, - 32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48, - 48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,79,66,67,49,10,32, - 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45, - 55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,55,48,45,55,49,44,102,48,45,102,49,58,54,48,48,48,45,55,102,102,102,44,101,48,48,48,45,102,102,102,102, - 32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77, - 32,99,111,110,116,101,110,116,61,83,97,118,101,10,10,98,111,97,114,100,58,32,83,65,49,45,82,65,77,10,32,32, - 
112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10, - 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48, - 45,50,51,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61, - 48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56, - 48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48, - 45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116, - 101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32, - 99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48, - 48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48, - 10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102, - 102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,73, - 110,116,101,114,110,97,108,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44, - 56,48,45,98,102,58,51,48,48,48,45,51,55,102,102,32,115,105,122,101,61,48,120,56,48,48,10,10,98,111,97,114, - 100,58,32,83,68,68,49,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83, - 68,68,49,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58, - 52,56,48,48,45,52,56,48,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114, - 101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32, - 
109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32, - 32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97, - 109,10,10,98,111,97,114,100,58,32,83,68,68,49,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101, - 61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115, + 79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116, + 61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56, + 48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109, + 111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97, + 112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32, + 109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99, + 116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45, + 54,102,44,101,48,45,101,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,51,102,102,102,10,32, + 32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114, + 97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111, + 114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101, + 99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65, + 77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55, + 
55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,76, + 79,82,79,77,45,82,65,77,35,65,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110, + 116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45, + 49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32, + 32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32, + 32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102, + 102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68, + 55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102, + 58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,51,102,102,102,10,32,32,32,32,109,101,109,111,114, + 121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105, + 116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61, + 82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80, + 68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110, + 116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32, + 111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,79,66,67,49,45,76,79,82,79,77,45,82,65, + 77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103, + 114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58, + 
56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111, + 114,32,105,100,101,110,116,105,102,105,101,114,61,79,66,67,49,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115, 115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101, - 48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,51,58,48,48,48,48,45,102, - 102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110, - 116,105,102,105,101,114,61,83,68,68,49,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51, - 102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,48,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32, - 109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102, - 102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102, - 102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, - 116,61,80,114,111,103,114,97,109,10,10,98,111,97,114,100,58,32,83,80,67,55,49,49,48,45,82,65,77,10,32,32, - 112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,32, - 32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56, - 51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,102, - 102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45, - 51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,48, - 10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102, - 
102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121, - 112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109, - 111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,101, - 109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32, - 32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102, - 102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,10,98,111,97,114,100,58,32,83,80,67,55,49,49,48,45,82, - 65,77,45,69,80,83,79,78,82,84,67,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105, - 101,114,61,83,80,67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102, - 44,56,48,45,98,102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,53,48,44,53,56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102, - 32,109,97,115,107,61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115, - 61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32, - 32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111, - 103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101, - 110,116,61,68,97,116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116, - 101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102, - 
44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,114, - 116,99,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,32,32,32,32,109,97,112,32,97,100, - 100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,52,48,45,52,56,52,50,10,32,32,32,32, - 109,101,109,111,114,121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110, - 117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,10,98,111,97,114,100,58,32,83,84,45,76,79,82,79,77, - 10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114, - 97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56, - 48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101, - 61,83,117,102,97,109,105,84,117,114,98,111,10,32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,97,112,32,97, - 100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115, - 107,61,48,120,56,48,48,48,10,32,32,32,32,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101, - 115,115,61,54,48,45,54,102,44,101,48,45,101,102,58,48,48,48,48,45,102,102,102,102,10,32,32,115,108,111,116,32, - 116,121,112,101,61,83,117,102,97,109,105,84,117,114,98,111,10,32,32,32,32,114,111,109,10,32,32,32,32,32,32,109, - 97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102, - 32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97, - 100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10, + 48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,49,44,102,48,45,102,49,58, + 
54,48,48,48,45,55,102,102,102,44,101,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10, + 32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101, + 10,10,98,111,97,114,100,58,32,83,65,49,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99, + 104,105,116,101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,32,32,32,32,109,97,112,32,97,100,100,114,101, + 115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,45,50,51,102,102,10,32,32,32,32,109,99,117, + 10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56, + 48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,48,48,48,10,32,32,32,32,32,32,109,97,112, + 32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109, + 101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32, + 32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10, + 32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48, + 48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121, + 32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,73,110,116,101,114,110,97,108,10,32,32,32,32,32, + 32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48,45,51,55, + 102,102,32,115,105,122,101,61,48,120,56,48,48,10,10,98,111,97,114,100,58,32,83,68,68,49,10,32,32,112,114,111, + 99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,68,68,49,10,32,32,32,32,109,97,112,32,97, + 
100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,48,102,10,32,32,32, + 32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45, + 98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99, + 48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101, + 61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,10,98,111,97,114,100,58,32,83,68,68, + 49,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61, + 83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102, + 58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97, + 100,100,114,101,115,115,61,55,48,45,55,51,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48, + 48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,68,68,49,10,32, + 32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45, + 52,56,48,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48, + 48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97, + 100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109, + 111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,10,98,111, + 97,114,100,58,32,83,80,67,55,49,49,48,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101, + 110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61, + 
48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,10,32,32, + 32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48, + 45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48, + 48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110, + 116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32, + 99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77, + 32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61, + 48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48, + 48,10,10,98,111,97,114,100,58,32,83,80,67,55,49,49,48,45,82,65,77,45,69,80,83,79,78,82,84,67,10,32, + 32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32, + 32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52, + 56,51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102, + 102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48, + 45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48, + 48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102, + 102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116, + 
121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101, + 109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109, + 101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32, + 32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55, + 102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,114,116,99,32,109,97,110,117,102,97,99,116,117,114, + 101,114,61,69,112,115,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56, + 48,45,98,102,58,52,56,52,48,45,52,56,52,50,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82, + 84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115, + 111,110,10,10,98,111,97,114,100,58,32,83,84,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112, + 101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100, + 100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107, + 61,48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,83,117,102,97,109,105,84,117,114,98,111,10, + 32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44, + 97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32, + 114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,54,102,44,101,48,45,101, + 102,58,48,48,48,48,45,102,102,102,102,10,32,32,115,108,111,116,32,116,121,112,101,61,83,117,102,97,109,105,84,117, + 114,98,111,10,32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48, + 
45,53,102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10, + 32,32,32,32,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44, + 102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10, }; const unsigned char iplrom[64] = { diff --git a/shaders/CRT-Royale.shader/bloom-approx.fs b/shaders/CRT-Royale.shader/bloom-approx.fs new file mode 100644 index 00000000..a56c09d6 --- /dev/null +++ b/shaders/CRT-Royale.shader/bloom-approx.fs @@ -0,0 +1,13973 @@ +#version 150 + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +in Vertex { + vec2 vTexCoord; + vec2 tex_uv; + vec2 blur_dxdy; + vec2 uv_scanline_step; + float estimated_viewport_size_x; + vec2 texture_size_inv; + vec2 tex_uv_to_pixel_scale; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define 
aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +#define ORIG_LINEARIZEDvideo_size sourceSize[1].xy +#define ORIG_LINEARIZEDtexture_size sourceSize[1].xy +#define ORIG_LINEARIZED source[1] + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN 
USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. 
+// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? 
They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. 
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an earlier note claimed this had been raised to 1.0 because 0.5 looked too dark, but the value here is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an earlier note claimed this had been raised to 1.0 because 0.5 looked too dark, but the value here is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; // mask_type: < 0.5 grille, < 1.5 slot, else shadow +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); // Without PHOSPHOR_MASK_MANUALLY_RESIZE, the result is limited to [1, 2] + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject 
to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. 
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. 
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. 
This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. 
LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // 
SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define 
DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. 
Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (raise toward 1.0 if the image looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (try 1.0 if that looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + // Pad by the MAGNITUDE of the static subpixel offset: the red offset may be + // negative (e.g. -1.0/3.0), and adding a signed value would shrink the + // border instead of growing it. Take the larger component so the single + // scalar border is a worst case for both dimensions: + static const float max_subpixel_offset = max( + abs(aa_subpixel_r_offset_static.x), abs(aa_subpixel_r_offset_static.y)); + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here.
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt + // The result is computed for abs(x) and multiplied by sign(x). + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency.
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings: + // erft() if ERF_FAST_APPROXIMATION is #defined, else erf6(). + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation (before the final pow(z, s) scale) + // looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0.0) * last_factorial) // k = 0 + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian_integral_contrib() for detailed comments! 
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. The exponent is wrapped in float3() because GLSL pow() requires both arguments to have the same type. + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // NOTE(review): the disabled override below looks like leftover debugging; confirm before removing. +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. 
+ // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling. 
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +/////////////////////////////// END VERTEX INCLUDES ///////////////////////////// + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// output_size < video_size. +// 4.) output_size == video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (video_size/output_size)/texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(video_size/output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) 
The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) 
tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. +// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. 
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) 
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.)
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRTs seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // Matches crt_reference_gamma_high + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct (true exactly when linearize_input is false for this pass): +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING
FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +// NOTE: Replacing the lookup wrappers with the macro below used to fix the blurs being offset: +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// The cause turned out to be the 'const' qualifier in front of the wrappers' coordinate parameters. + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec RetroArch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D (NOTE(review): the texel_off overloads pass texel_off as the LOD argument of textureLod(), not as a texel offset; confirm this is intended): +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod (NOTE(review): only tex_coords.xy is sampled; the LOD given to textureLod() is 0.0 or texel_off, never tex_coords.w; confirm this is intended): +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* NOTE(review): this looks like a mangled block-comment terminator. As written, the comment opened before the tex3D section stays open until the terminator just before the tex2Dlod_linearize_gamma overloads, so the tex2D_linearize_gamma overloads in between are commented out - confirm this is intended. + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod (NOTE(review): only tex_coords.xy is sampled; the LOD given to textureLod() is 0.0 or texel_off, never tex_coords.w; confirm this is intended): +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "quad-pixel-communication.h" + +/////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION ////////////////////// + +#ifndef QUAD_PIXEL_COMMUNICATION_H +#define QUAD_PIXEL_COMMUNICATION_H +
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// Given screen pixel numbers, derive a "quad vector" describing a fragment's +// position in its 2x2 pixel quad. Given that vector, obtain the values of any +// variable at neighboring fragments. +// Requires: Using this file in general requires: +// 1.) ddx() and ddy() are present in the current Cg profile. +// 2.) The GPU driver is using fine/high-quality derivatives. +// Functions will give incorrect results if this is not true, +// so a test function is included. + + +///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES //////////////////// + +float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Two measures of the current fragment's output pixel number + // in the range ([0, output_size.x), [0, output_size.y)): + // 1.) output_pixel_num_wrt_uvxy.xy increase with uv coords. + // 2.) output_pixel_num_wrt_uvxy.zw increase with screen xy. + // Returns: Two measures of the fragment's position in its 2x2 quad: + // 1.) The .xy components are its 2x2 placement with respect to + // uv direction (the origin (0, 0) is at the top-left): + // top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0) + // bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0) + // You need this to arrange/weight shared texture samples. + // 2.) The .zw components are its 2x2 placement with respect to + // screen xy direction (position); the origin varies. + // quad_gather needs this measure to work correctly. + // Note: quad_vector.zw = quad_vector.xy * float2( + // ddx(output_pixel_num_wrt_uvxy.x), + // ddy(output_pixel_num_wrt_uvxy.y)); + // Caveats: This function assumes the GPU driver always starts 2x2 pixel + // quads at even pixel numbers. This assumption can be wrong + // for odd output resolutions (nondeterministically so).
+ float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; + float4 quad_vector = pixel_odd * 2.0 - float4(1.0); + return quad_vector; +} + +float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Same as get_quad_vector_naive() (see that first). + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + float4 quad_vector_guess = + get_quad_vector_naive(output_pixel_num_wrt_uvxy); + // If quad_vector_guess.zw doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z), + ddy(quad_vector_guess.w)); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +float4 get_quad_vector(float2 output_pixel_num_wrt_uv) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) output_pixel_num_wrt_uv must increase with uv coords and + // measure the current fragment's output pixel number in: + // ([0, output_size.x), [0, output_size.y)) + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + // Caveats: This function requires less information than the version + // taking a float4, but it's potentially slower. + // Do screen coords increase with or against uv? Get the direction + // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
+ float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x), + ddy(output_pixel_num_wrt_uv.y)); + float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; + float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0; + float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; + // If quad_vector_screen_guess doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x), + ddy(quad_vector_screen_guess.y)); + float4 quad_vector_guess = float4( + quad_vector_uv_guess, quad_vector_screen_guess); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +void quad_gather(float4 quad_vector, float4 curr, + out float4 adjx, out float4 adjy, out float4 diag) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) The GPU driver is using fine/high-quality derivatives. + // 3.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 4.) curr is any vector you wish to get neighboring values of. + // Returns: Values of an input vector (curr) at neighboring fragments + // adjacent x, adjacent y, and diagonal (via out parameters).
+ adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float3 curr, + out float3 adjx, out float3 adjy, out float3 diag) +{ + // Float3 overload of quad_gather(): same computation as the float4 version. + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float2 curr, + out float2 adjx, out float2 adjy, out float2 diag) +{ + // Float2 overload of quad_gather(): same computation as the float4 version. + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +float4 quad_gather(float4 quad_vector, float curr) +{ + // Float version: + // Returns: return.x == current + // return.y == adjacent x + // return.z == adjacent y + // return.w == diagonal + float4 all = float4(curr); + all.y = all.x - ddx(all.x) * quad_vector.z; + all.zw = all.xy - ddy(all.xy) * quad_vector.w; + return all; +} + +float4 quad_gather_sum(float4 quad_vector, float4 curr) +{ + // Requires: Same as quad_gather() + // Returns: Sum of an input vector (curr) at all fragments in a quad. + float4 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float3 quad_gather_sum(float4 quad_vector, float3 curr) +{ + // Float3 overload of quad_gather_sum(): + float3 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float2 quad_gather_sum(float4 quad_vector, float2 curr) +{ + // Float2 overload of quad_gather_sum(): + float2 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float quad_gather_sum(float4 quad_vector, float curr) +{ + // Float overload of quad_gather_sum(); sums the four components returned by the float quad_gather(): + float4 all_values = quad_gather(quad_vector, curr); + return (all_values.x + all_values.y + all_values.z + all_values.w); +} + +bool fine_derivatives_working(float4 quad_vector, float4 curr) +{ + // Requires: 1.)
ddx() and ddy() are present in the current Cg profile. + // 2.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 3.) curr must be a test vector with non-constant derivatives + // (its value should change nonlinearly across fragments). + // Returns: true if fine/hybrid/high-quality derivatives are used, or + // false if coarse derivatives are used or inconclusive + // Usage: Test whether quad-pixel communication is working! + // Method: We can confirm fine derivatives are used if the following + // holds (ever, for any value at any fragment): + // (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy)) + // The more values we test (e.g. test a float4 two ways), the + // easier it is to demonstrate fine derivatives are working. + // TODO: Check for floating point exact comparison issues! + float4 ddx_curr = ddx(curr); + float4 ddy_curr = ddy(curr); + float4 adjx = curr - ddx_curr * quad_vector.z; + float4 adjy = curr - ddy_curr * quad_vector.w; + bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w)); + bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w)); + return any(bool2(ddy_different, ddx_different)); +} + +bool fine_derivatives_working_fast(float4 quad_vector, float curr) +{ + // Requires: Same as fine_derivatives_working() + // Returns: Same as fine_derivatives_working() + // Usage: This is faster than fine_derivatives_working() but more + // likely to return false negatives, so it's less useful for + // offline testing/debugging. It's also useless as the basis + // for dynamic runtime branching as of May 2014: Derivatives + // (and quad-pixel communication) are currently disallowed in + // branches.
However, future GPUs may allow you to use them + // in dynamic branches if you promise the branch condition + // evaluates the same for every fragment in the quad (and/or if + // the driver enforces that promise by making a single fragment + // control branch decisions). If that ever happens, this + // version may become a more economical choice. + float ddx_curr = ddx(curr); + float ddy_curr = ddy(curr); + float adjx = curr - ddx_curr * quad_vector.z; + return (ddy_curr != ddy(adjx)); +} + +#endif // QUAD_PIXEL_COMMUNICATION_H + +//////////////////////// END QUAD-PIXEL-COMMUNICATION /////////////////////// + +//#include "special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0) * last_factorial) + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blurs sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using a 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-4, ignoring constant factors (since we're normalizing). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Both taps carry weight w01, and 2.0 * w01 == w0 + 2.0 * w1, so each normalized weight is exactly 0.5 (weight_sum_inv is not needed here); just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + 
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const 
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file 
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + 
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. 
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * 
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, 
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * 
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + // (w0 is the center texel at distance 0: exp(0) = 1.0.) + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + // (one center sample, four axis-aligned samples, four diagonal samples) + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + // The .yx swizzle reuses the x-axis offset for the vertical (D/U) samples: + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = 
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(video_size/output_size).y + // 6.) 
tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will load (6^2)/4 + 3 = 12 + // bilinear samples per fragment, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps. 
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = 
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // sample8 is intentionally not gathered: this blur omits 8adjx/8adjy/8diag, + // so only sample8curr contributes to the sum. For the same reason, the + // gathered sample2adjx/2diag, sample5adjx/5diag, sample6adjy/6diag, and + // sample7adjy/7diag are left unused by the weighted sum. + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + 
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be some inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample0*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs (each wrapper below forwards to its same-named overload, passing the matching blurN_std_dev constant as the final argument): +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +//#include "bloom-functions.h" + +//////////////////////////// BEGIN BLOOM-FUNCTIONS /////////////////////////// + +#ifndef BLOOM_FUNCTIONS_H +#define BLOOM_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These utility functions and constants help several passes determine the +// size and center texel weight of the phosphor bloom in a uniform manner. + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// We need to calculate the correct blur sigma using some .cgp constants: +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said this was raised to 1.0 for being too dark, but the value is 0.5 -- confirm which is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise it toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) 
filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// output_size < video_size. +// 4.) output_size == video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (video_size/output_size)/texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(video_size/output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. 
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) 
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.)
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. 
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING 
FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/*
+// tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{ return decode_input(tex1D(tex, tex_coords)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{ return decode_input(tex1D(tex, tex_coords)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{ return decode_input(tex1D(tex, tex_coords, texel_off)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{ return decode_input(tex1D(tex, tex_coords, texel_off)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{ return decode_input(tex1D(tex, tex_coords, dx, dy)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{ return decode_input(tex1D(tex, tex_coords, dx, dy)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{ return decode_input(tex1Dbias(tex, tex_coords)); }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); }
+
+// tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{ return decode_input(tex1Dfetch(tex, tex_coords)); }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); }
+
+// tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{ return decode_input(tex1Dlod(tex, tex_coords)); }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
+
+// tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+*/
+// tex2D:
+// Each live wrapper below performs one texture lookup and hands the texel to
+// decode_input(), so callers receive the sample after this pass's input
+// decoding has been applied (a no-op when linearize_input is false).
+// The coordinate parameters are intentionally NOT const-qualified; see the
+// TODO/EDIT note above the "TEXTURE LOOKUP WRAPPERS" banner.
+
+// float2 coords: plain COMPAT_TEXTURE lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); }
+
+// float3 coords: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); }
+
+// NOTE(review): in the two overloads below texel_off is forwarded as the
+// third argument of textureLod().  In standard GLSL that argument is the
+// level of detail, not a texel offset, so these do not appear to reproduce
+// the commented-out Cg tex2D(tex, coords, texel_off) form -- confirm whether
+// any caller relies on them before changing.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords, texel_off)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{ return decode_input(tex2Dbias(tex, tex_coords)); }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
+
+// tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{ return decode_input(tex2Dfetch(tex, tex_coords)); }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
+
+// tex2Dlod:
+// Only tex_coords.xy is used; the first overload always passes 0.0 as the
+// third textureLod() argument and ignores tex_coords.zw.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); }
+
+// NOTE(review): as with tex2D_linearize above, texel_off ends up in the
+// textureLod() level-of-detail slot -- confirm intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+/*
+// tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+*/
+// NOTE(review): the block comment opened on the next line is followed, after
+// the tex3Dproj wrappers, by a line of slashes ending in an asterisk rather
+// than by an asterisk-slash terminator.  If that reading is right, the
+// comment stays open until the terminator that follows the
+// tex2Dfetch_linearize_gamma section, which would leave both
+// tex2D_linearize_gamma overloads disabled -- confirm whether that is
+// intentional.
+/*
+// tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{ return decode_input(tex3D(tex, tex_coords)); }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex3D(tex, tex_coords, texel_off)); }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{ return decode_input(tex3D(tex, tex_coords, dx, dy)); }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{ return decode_input(tex3Dbias(tex, tex_coords)); }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
+
+// tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{ return decode_input(tex3Dfetch(tex, tex_coords)); }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
+
+// tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{ return decode_input(tex3Dlod(tex, tex_coords)); }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
+
+// tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{ return decode_input(tex3Dproj(tex, tex_coords)); }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
+/////////*
+
+// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// This narrow selection of nonstandard tex2D* functions can be useful:
+
+// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
+
+
+// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a narrower selection of tex2D* wrapper functions that decode an
+// input sample with a specified gamma value.  These are useful for reading
+// LUT's and for reading the input of pass0 in a later pass.
+
+// tex2D:
+// Look up tex at tex_coords and decode the texel with the caller-supplied
+// per-channel gamma via decode_gamma_input().
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
+
+// float3 coords: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+/*
+// tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
+
+// tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
+*/
+// tex2Dlod:
+// Only tex_coords.xy is used; the first overload always passes 0.0 as the
+// third textureLod() argument, then decodes with the supplied gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); }
+
+// NOTE(review): texel_off is passed in the textureLod() level-of-detail
+// slot here as well -- confirm intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); }
+
+
+#endif // GAMMA_MANAGEMENT_H
+
+//////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+/////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
+
+///////////////////////////////// DESCRIPTION ////////////////////////////////
+
+// Given screen pixel numbers, derive a "quad vector" describing a fragment's
+// position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+// variable at neighboring fragments.
+// Requires:   Using this file in general requires:
+//             1.) ddx() and ddy() are present in the current Cg profile.
+//             2.) The GPU driver is using fine/high-quality derivatives.
+//                 Functions will give incorrect results if this is not true,
+//                 so a test function is included.
+
+
+///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    // Requires:   Two measures of the current fragment's output pixel number
+    //             in the range ([0, output_size.x), [0, output_size.y)):
+    //             1.) output_pixel_num_wrt_uvxy.xy increase with uv coords.
+    //             2.) output_pixel_num_wrt_uvxy.zw increase with screen xy.
+    // Returns:    Two measures of the fragment's position in its 2x2 quad:
+    //             1.) The .xy components are its 2x2 placement with respect to
+    //                 uv direction (the origin (0, 0) is at the top-left):
+    //                 top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0)
+    //                 bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0)
+    //                 You need this to arrange/weight shared texture samples.
+    //             2.) The .zw components are its 2x2 placement with respect to
+    //                 screen xy direction (position); the origin varies.
+    //                 quad_gather needs this measure to work correctly.
+    //             Note: quad_vector.zw = quad_vector.xy * float2(
+    //                     ddx(output_pixel_num_wrt_uvxy.x),
+    //                     ddy(output_pixel_num_wrt_uvxy.y));
+    // Caveats:    This function assumes the GPU driver always starts 2x2 pixel
+    //             quads at even pixel numbers.  This assumption can be wrong
+    //             for odd output resolutions (nondeterministically so).
+    // For integer-valued pixel numbers, frac(n * 0.5) * 2.0 is the parity
+    // of n (0.0 for even, 1.0 for odd):
+    float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
+    // Remap that parity from {0, 1} to {-1, 1}:
+    float4 quad_vector = pixel_odd * 2.0 - float4(1.0);
+    return quad_vector;
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    // Requires:   Same as get_quad_vector_naive() (see that first).
+    // Returns:    Same as get_quad_vector_naive() (see that first), but it's
+    //             correct even if the 2x2 pixel quad starts at an odd pixel,
+    //             which can occur at odd resolutions.
+    float4 quad_vector_guess =
+        get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    // If quad_vector_guess.zw doesn't increase with screen xy, we know
+    // the 2x2 pixel quad starts at an odd pixel:
+    float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z),
+        ddy(quad_vector_guess.w));
+    // The same x/y correction factors are applied to both the uv-relative
+    // (.xy) and the screen-relative (.zw) halves of the guess:
+    return quad_vector_guess * odd_start_mirror.xyxy;
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    // Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //             2.) output_pixel_num_wrt_uv must increase with uv coords and
+    //                 measure the current fragment's output pixel number in:
+    //                 ([0, output_size.x), [0, output_size.y))
+    // Returns:    Same as get_quad_vector_naive() (see that first), but it's
+    //             correct even if the 2x2 pixel quad starts at an odd pixel,
+    //             which can occur at odd resolutions.
+    // Caveats:    This function requires less information than the version
+    //             taking a float4, but it's potentially slower.
+    // Do screen coords increase with or against uv?  Get the direction
+    // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
+    float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x),
+        ddy(output_pixel_num_wrt_uv.y));
+    // Same parity-to-{-1, 1} mapping as get_quad_vector_naive(), written as
+    // (parity - 0.5) * 2.0:
+    float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0;
+    float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror;
+    // If quad_vector_screen_guess doesn't increase with screen xy, we know
+    // the 2x2 pixel quad starts at an odd pixel:
+    float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x),
+        ddy(quad_vector_screen_guess.y));
+    // Pack (uv guess, screen guess) into .xy/.zw and apply the correction:
+    float4 quad_vector_guess = float4(
+        quad_vector_uv_guess, quad_vector_screen_guess);
+    return quad_vector_guess * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    // Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //             2.) The GPU driver is using fine/high-quality derivatives.
+    //             3.) quad_vector describes the current fragment's location in
+    //                 its 2x2 pixel quad using get_quad_vector()'s conventions.
+    //             4.) curr is any vector you wish to get neighboring values of.
+    // Returns:    Values of an input vector (curr) at neighboring fragments
+    //             adjacent x, adjacent y, and diagonal (via out parameters).
+    // Each neighbor is reconstructed as curr minus a screen-space derivative
+    // scaled by the screen-relative quad position (quad_vector.zw); the
+    // diagonal is obtained by applying the y step to adjx.
+    adjx = curr - ddx(curr) * quad_vector.z;
+    adjy = curr - ddy(curr) * quad_vector.w;
+    diag = adjx - ddy(adjx) * quad_vector.w;
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    // Float3 version: same requirements and outputs as the float4
+    // quad_gather(), for a float3 value.
+    adjx = curr - ddx(curr) * quad_vector.z;
+    adjy = curr - ddy(curr) * quad_vector.w;
+    diag = adjx - ddy(adjx) * quad_vector.w;
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    // Float2 version: same requirements and outputs as the float4
+    // quad_gather(), for a float2 value.
+    adjx = curr - ddx(curr) * quad_vector.z;
+    adjy = curr - ddy(curr) * quad_vector.w;
+    diag = adjx - ddy(adjx) * quad_vector.w;
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    // Float version:
+    // Requires:   Same as the float4 quad_gather().
+    // Returns:    return.x == current
+    //             return.y == adjacent x
+    //             return.z == adjacent y
+    //             return.w == diagonal
+    float4 all = float4(curr);
+    // Fill .y with the x-neighbor, then derive (.z, .w) from (.x, .y) with a
+    // single y step so the diagonal comes from the x-neighbor:
+    all.y = all.x - ddx(all.x) * quad_vector.z;
+    all.zw = all.xy - ddy(all.xy) * quad_vector.w;
+    return all;
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    // Requires:   Same as quad_gather()
+    // Returns:    Sum of an input vector (curr) at all fragments in a quad.
+    float4 adjx, adjy, diag;
+    quad_gather(quad_vector, curr, adjx, adjy, diag);
+    return (curr + adjx + adjy + diag);
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    // Float3 version: sum of curr over the four fragments of the quad.
+    float3 adjx, adjy, diag;
+    quad_gather(quad_vector, curr, adjx, adjy, diag);
+    return (curr + adjx + adjy + diag);
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    // Float2 version: sum of curr over the four fragments of the quad.
+    float2 adjx, adjy, diag;
+    quad_gather(quad_vector, curr, adjx, adjy, diag);
+    return (curr + adjx + adjy + diag);
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    // Float version: the scalar quad_gather() packs all four values into
+    // one float4, so the sum is just the sum of its components.
+    float4 all_values = quad_gather(quad_vector, curr);
+    return (all_values.x + all_values.y + all_values.z + all_values.w);
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    // Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //             2.) quad_vector describes the current fragment's location in
+    //                 its 2x2 pixel quad using get_quad_vector()'s conventions.
+    //             3.) curr must be a test vector with non-constant derivatives
+    //                 (its value should change nonlinearly across fragments).
+    // Returns:    true if fine/hybrid/high-quality derivatives are used, or
+    //             false if coarse derivatives are used or inconclusive
+    // Usage:      Test whether quad-pixel communication is working!
+    // Method:     We can confirm fine derivatives are used if the following
+    //             holds (ever, for any value at any fragment):
+    //                 (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //             The more values we test (e.g. test a float4 two ways), the
+    //             easier it is to demonstrate fine derivatives are working.
+    // TODO: Check for floating point exact comparison issues!
+    float4 ddx_curr = ddx(curr);
+    float4 ddy_curr = ddy(curr);
+    float4 adjx = curr - ddx_curr * quad_vector.z;
+    float4 adjy = curr - ddy_curr * quad_vector.w;
+    // Component-by-component exact inequality tests, combined with any():
+    bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w));
+    bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w));
+    return any(bool2(ddy_different, ddx_different));
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    // Requires:   Same as fine_derivatives_working()
+    // Returns:    Same as fine_derivatives_working()
+    // Usage:      This is faster than fine_derivatives_working() but more
+    //             likely to return false negatives, so it's less useful for
+    //             offline testing/debugging.  It's also useless as the basis
+    //             for dynamic runtime branching as of May 2014: Derivatives
+    //             (and quad-pixel communication) are currently disallowed in
+    //             branches.  However, future GPU's may allow you to use them
+    //             in dynamic branches if you promise the branch condition
+    //             evaluates the same for every fragment in the quad (and/or if
+    //             the driver enforces that promise by making a single fragment
+    //             control branch decisions).  If that ever happens, this
+    //             version may become a more economical choice.
+    // Only one scalar is tested, and only the ddy(curr) != ddy(adjx) half
+    // of the full check is evaluated:
+    float ddx_curr = ddx(curr);
+    float ddy_curr = ddy(curr);
+    float adjx = curr - ddx_curr * quad_vector.z;
+    return (ddy_curr != ddy(adjx));
+}
+
+#endif // QUAD_PIXEL_COMMUNICATION_H
+
+//////////////////////// END QUAD-PIXEL-COMMUNICATION ///////////////////////
+
+//#include "special-functions.h"
+
+/////////////////////////// BEGIN SPECIAL-FUNCTIONS //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+///////////////////////////////// MIT LICENSE ////////////////////////////////
+
+// Copyright (C) 2014 TroggleMonkey
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+
+
+///////////////////////////////// DESCRIPTION ////////////////////////////////
+
+// This file implements the following mathematical special functions:
+// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
+// 2.) gamma(s), a real-numbered extension of the integer factorial function
+// It also implements normalized_ligamma(s, z), a normalized lower incomplete
+// gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+// be called with an _impl suffix to use an implementation version with a few
+// extra precomputed parameters (which may be useful for the caller to reuse).
+// See below for details.
+//
+// Design Rationale:
+// Pretty much every line of code in this file is duplicated four times for
+// different input types (float4/float3/float2/float).  This is unfortunate,
+// but Cg doesn't allow function templates.  Macros would be far less verbose,
+// but they would make the code harder to document and read.  I don't expect
+// these functions will require a whole lot of maintenance changes unless
+// someone ever has need for more robust incomplete gamma functions, so code
+// duplication seems to be the lesser evil in this case.
+
+
+/////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
+
+float4 erf6(float4 x)
+{
+    // Requires:   x is the standard parameter to erf().
+    // Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                 erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //             This approximation has a max absolute error of 2.5*10**-5
+    //             with solid numerical robustness and efficiency.  See:
+    //                 https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    // The formula is evaluated on |x| and the sign is restored at the end.
+    static const float4 unity = float4(1.0);
+    const float4 x_sign = sign(x);
+    const float4 u = unity/(unity + 0.47047*abs(x));
+    // Horner form of the cubic in u, including the leading factor of u:
+    const float4 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float4 magnitude = unity - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+float3 erf6(const float3 x)
+{
+    // Float3 version of erf6(); see the float4 version for details.
+    static const float3 unity = float3(1.0);
+    const float3 x_sign = sign(x);
+    const float3 u = unity/(unity + 0.47047*abs(x));
+    const float3 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float3 magnitude = unity - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+float2 erf6(const float2 x)
+{
+    // Float2 version of erf6(); see the float4 version for details.
+    static const float2 unity = float2(1.0);
+    const float2 x_sign = sign(x);
+    const float2 u = unity/(unity + 0.47047*abs(x));
+    const float2 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float2 magnitude = unity - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+float erf6(const float x)
+{
+    // Float version of erf6(); see the float4 version for details.
+    const float x_sign = sign(x);
+    const float u = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+float4 erft(const float4 x)
+{
+    // Requires:   x is the standard parameter to erf().
+    // Returns:    Approximate erf() with the hyperbolic tangent.  The error is
+    //             visually noticeable, but it's blazing fast and perceptually
+    //             close...at least on ATI hardware.  See:
+    //                 http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    // Warning:    Only use this if your hardware drivers correctly implement
+    //             tanh(): My nVidia 8800GTS returns garbage output.
+    float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    // Float3 version of erft(); see the float4 version for caveats.
+    float3 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    // Float2 version of erft(); see the float4 version for caveats.
+    float2 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    // Float version of erft(); see the float4 version for caveats.
+    float scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    // Requires:   x is the standard parameter to erf().
+    // Returns:    Some approximation of erf(x), depending on user settings:
+    //             erft() when ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    // Float3 version: erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    // Float2 version: erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    // Float version: erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+/////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    // Requires:   1.) s is the standard parameter to the gamma function, and
+    //                 it should lie in the [0, 36] range.
+    //             2.) s_inv = 1.0/s.  This implementation function requires
+    //                 the caller to precompute this value, giving users the
+    //                 opportunity to reuse it.
+    // Returns:    Return approximate gamma function (real-numbered factorial)
+    //             output using the Lanczos approximation with two coefficients
+    //             calculated using Paul Godfrey's method here:
+    //                 http://my.fit.edu/~gabdo/gamma.txt
+    //             An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //             a maximum relative error of 0.000463 for 2**16 equally
+    //             spaced evals.  We could use three coeffs (0.0000346 error)
+    //             without hurting latency, but this allows more parallelism
+    //             with outside instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler; // or (s + g + float4(0.5))/e
+    // gamma(s + 1) = pow_base**s_plus_half * series; divide by s for gamma(s).
+    // This has less error for small s's than (s -= 1.0) at the beginning.
+    const float4 gamma_s_plus_one = pow(pow_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    // Float3 version of gamma_impl(); see the float4 version for details.
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler;
+    const float3 gamma_s_plus_one = pow(pow_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    // Float2 version of gamma_impl(); see the float4 version for details.
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler;
+    const float2 gamma_s_plus_one = pow(pow_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    // Float version of gamma_impl(); see the float4 version for details.
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float pow_base = (s_plus_half + lanczos_g)/euler;
+    const float gamma_s_plus_one = pow(pow_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    // Requires:   s is the standard parameter to the gamma function, and it
+    //             should lie in the [0, 36] range.
+    // Returns:    Return approximate gamma function output with a maximum
+    //             relative error of 0.000463.  See gamma_impl for details.
+    // Convenience wrapper: computes 1.0/s itself, then defers to gamma_impl().
+    float4 reciprocal_s = float4(1.0)/s;
+    return gamma_impl(s, reciprocal_s);
+}
+
+float3 gamma(const float3 s)
+{
+    // Float3 version of gamma(); see the float4 version for details.
+    float3 reciprocal_s = float3(1.0)/s;
+    return gamma_impl(s, reciprocal_s);
+}
+
+float2 gamma(const float2 s)
+{
+    // Float2 version of gamma(); see the float4 version for details.
+    float2 reciprocal_s = float2(1.0)/s;
+    return gamma_impl(s, reciprocal_s);
+}
+
+float gamma(const float s)
+{
+    // Float version of gamma(); see the float4 version for details.
+    float reciprocal_s = 1.0/s;
+    return gamma_impl(s, reciprocal_s);
+}
+
+
+//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
+
+// Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    // Requires: 1.) s < ~0.5
+    // 2.) z <= ~0.775075
+    // 3.) s_inv = 1.0/s (precomputed for outside reuse)
+    // Returns: A series representation for the lower incomplete gamma
+    // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + k) * last_factorial) + // for(int i = 0; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blurs sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using a 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-4, ignoring constant factors (since we're normalizing). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + 
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const 
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file 
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + 
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. 
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * 
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, 
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * 
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = 
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(video_size/output_size).y + // 6.) 
tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12 + // bilinear samples, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps. 
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = 
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same": + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + // Sample 8 is only used from the current quadrant (its neighbors are + // never gathered above), so it takes a plain scalar weight: + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + 
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same": + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample0*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +/////////////////////////////// BLOOM CONSTANTS ////////////////////////////// + +// Compute constants with manual inlines of the functions below: +static const float bloom_diff_thresh = 1.0/256.0; + + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +inline float get_absolute_scale_blur_sigma(const float thresh) +{ + // Requires: 1.) min_expected_triads must be a global float. The number + // of horizontal phosphor triads in the final image must be + // >= min_allowed_viewport_triads.x for realistic results. + // 2.) bloom_approx_scale_x must be a global float equal to the + // absolute horizontal scale of BLOOM_APPROX. + // 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x + // should be <= 1.1658025090 to keep the final result < + // 0.62666015625 (the largest sigma ensuring the largest + // unused texel weight stays < 1.0/256.0 for a 3x3 blur). + // 4.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). 
+ // Returns: Return the minimum Gaussian sigma that will blur the pass + // output as much as it would have taken to blur away + // bloom_approx_scale_x horizontal phosphor triads. + // Description: + // BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd + // use the same blur sigma as the actual phosphor bloom and scale it down + // to the current resolution with (bloom_approx_scale_x/viewport_size_x), but + // we don't know the viewport size in this pass. Instead, we'll blur as + // much as it would take to blur away min_allowed_viewport_triads.x. This + // will blur "more than necessary" if the user actually uses more triads, + // but that's not terrible either, because blurring a constant fraction of + // the viewport may better resemble a true optical bloom anyway (since the + // viewport will generally be about the same fraction of each player's + // field of view, regardless of screen size and resolution). + // Assume an extremely large viewport size for asymptotic results. + return bloom_approx_scale_x/max_viewport_size_x * + get_min_sigma_to_blur_triad( + max_viewport_size_x/min_allowed_viewport_triads.x, thresh); +} + +inline float get_center_weight(const float sigma) +{ + // Given a Gaussian blur sigma, get the blur weight for the center texel. 
+ #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return get_fast_gaussian_weight_sum_inv(sigma); + #else + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + // Note: If the implementation uses a smaller blur than the max allowed, + // the worst case scenario is that the center weight will be overestimated, + // so we'll put a bit more energy into the brightpass...no huge deal. 
+ // Then again, if the implementation uses a larger blur than the max + // "allowed" because of dynamic branching, the center weight could be + // underestimated, which is more of a problem...consider always using + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // 43x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + + w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + // 31x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + + w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + // 25x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + // 17x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + #else + // 9x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + const float center_weight = weight_sum_inv * weight_sum_inv; + return center_weight; + #endif +} + +inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // If sigma is static, we can safely branch and use the smallest blur + // that's big enough. Ignore #define hints, because we'll only use a + // large blur if we actually need it, and the branches cost nothing. 
+ #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #else + // It's still worth branching if the profile supports dynamic branches: + // It's much faster than using a hugely excessive blur, but each branch + // eats ~1% FPS. + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #endif + #endif + // Failed optimization notes: + // I originally created a same-size mipmapped 5-tap separable blur10 that + // could handle any sigma by reaching into lower mip levels. It was + // as fast as blur25fast for runtime sigmas and a tad faster than + // blur31fast for static sigmas, but mipmapping two viewport-size passes + // ate 10% of FPS across all codepaths, so it wasn't worth it. + #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + if(sigma <= blur9_std_dev) + { + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur17_std_dev) + { + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur25_std_dev) + { + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur31_std_dev) + { + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + } + else + { + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + } + #else + // If we can't afford to branch, we can only guess at what blur + // size we need. Therefore, use the largest blur allowed. 
+ #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + #else + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + #endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE +} + +inline float get_bloom_approx_sigma(const float output_size_x_runtime, + const float estimated_viewport_size_x) +{ + // Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x. + // This is included for dynamic codepaths just in case the + // following two globals are incorrect: + // 2.) bloom_approx_size_x_for_skip should == the same + // if PHOSPHOR_BLOOM_FAKE is #defined + // 3.) bloom_approx_size_x should == the same otherwise + // Returns: For gaussian4x4, return a dynamic small bloom sigma that's + // as close to optimal as possible given available information. + // For blur3x3, return the a static small bloom sigma that + // works well for typical cases. Otherwise, we're using simple + // bilinear filtering, so use static calculations. + // Assume the default static value. This is a compromise that ensures + // typical triads are blurred, even if unusually large ones aren't. 
+ static const float mask_num_triads_static = + max(min_allowed_viewport_triads.x, mask_num_triads_desired_static); + const float mask_num_triads_from_size = + estimated_viewport_size_x/mask_triad_size_desired; + const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x, + lerp(mask_num_triads_from_size, mask_num_triads_desired, + mask_specify_num_triads)); + // Assume an extremely large viewport size for asymptotic results: + static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // Use the runtime num triads and output size: + const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_runtime; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_runtime/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // account for the Gaussian scanline sigma from the last pass too. + // The bloom will be too wide horizontally but tall enough vertically. + return length(float2(bloom_approx_sigma, beam_max_sigma)); + } + else // 3x3 blur resize (the bilinear resize doesn't need a sigma) + { + // We're either using blur3x3 or bilinear filtering. The biggest + // reason to choose blur3x3 is to avoid dynamic weights, so use a + // static calculation. 
+ #ifdef PHOSPHOR_BLOOM_FAKE + static const float output_size_x_static = + bloom_approx_size_x_for_fake; + #else + static const float output_size_x_static = bloom_approx_size_x; + #endif + static const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_static; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_static/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // try accounting for the Gaussian scanline sigma from the last pass + // too; use the static default value: + return length(float2(bloom_approx_sigma, beam_max_sigma_static)); + } +} + +inline float get_final_bloom_sigma(const float bloom_sigma_runtime) +{ + // Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's + // optimal for the [known] triad size. + // 2.) Call this from a fragment shader (not a vertex shader), + // or blurring with static sigmas won't be constant-folded. + // Returns: Return the optimistic static sigma if the triad size is + // known at compile time. Otherwise return the optimal runtime + // sigma (10% slower) or an implementation-specific compromise + // between an optimistic or pessimistic static sigma. + // Notes: Call this from the fragment shader, NOT the vertex shader, + // so static sigmas can be constant-folded! 
+ const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad( + mask_triad_size_desired_static, bloom_diff_thresh); + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return bloom_sigma_runtime; + #else + // Overblurring looks as bad as underblurring, so assume average-size + // triads, not worst-case huge triads: + return bloom_sigma_optimistic; + #endif +} + + +#endif // BLOOM_FUNCTIONS_H + +//////////////////////////// END BLOOM-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +/////////////////////////////////// HELPERS ////////////////////////////////// + +float3 tex2Dresize_gaussian4x4(sampler2D tex, float2 tex_uv, float2 dxdy, float2 tex_size, float2 texture_size_inv, float2 tex_uv_to_pixel_scale, float sigma) +{ + // Requires: 1.) All requirements of gamma-management.h must be satisfied! + // 2.) filter_linearN must == "true" in your .cgp preset. + // 3.) mipmap_inputN must == "true" in your .cgp preset if + // output_size << SRC.video_size. + // 4.) dxdy should contain the uv pixel spacing: + // dxdy = max(float2(1.0), + // SRC.video_size/output_size)/SRC.texture_size; + // 5.) texture_size == SRC.texture_size + // 6.) texture_size_inv == float2(1.0)/SRC.texture_size + // 7.) tex_uv_to_pixel_scale == output_size * + // SRC.texture_size / SRC.video_size; + // 8.) sigma is the desired Gaussian standard deviation, in + // terms of output pixels. It should be < ~0.66171875 to + // ensure the first unused sample (outside the 4x4 box) has + // a weight < 1.0/256.0. + // Returns: A true 4x4 Gaussian resize of the input. + // Description: + // Given correct inputs, this Gaussian resizer samples 4 pixel locations + // along each downsized dimension and/or 4 texel locations along each + // upsized dimension. It computes dynamic weights based on the pixel-space + // distance of each sample from the destination pixel. It is arbitrarily + // resizable and higher quality than tex2Dblur3x3_resize, but it's slower. 
+ // TODO: Move this to a more suitable file once there are others like it. + const float denom_inv = 0.5/(sigma*sigma); + // We're taking 4x4 samples, and we're snapping to texels for upsizing. + // Find texture coords for sample 5 (second row, second column): + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_uv = prev_texel * texture_size_inv; + const float2 snap = float2((dxdy.x <= texture_size_inv.x), (dxdy.y <= texture_size_inv.y)); + const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy; + const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap); + // Compute texture coords for other samples: + const float2 dx = float2(dxdy.x, 0.0); + const float2 sample0_uv = sample5_uv - dxdy; + const float2 sample10_uv = sample5_uv + dxdy; + const float2 sample15_uv = sample5_uv + 2.0 * dxdy; + const float2 sample1_uv = sample0_uv + dx; + const float2 sample2_uv = sample0_uv + 2.0 * dx; + const float2 sample3_uv = sample0_uv + 3.0 * dx; + const float2 sample4_uv = sample5_uv - dx; + const float2 sample6_uv = sample5_uv + dx; + const float2 sample7_uv = sample5_uv + 2.0 * dx; + const float2 sample8_uv = sample10_uv - 2.0 * dx; + const float2 sample9_uv = sample10_uv - dx; + const float2 sample11_uv = sample10_uv + dx; + const float2 sample12_uv = sample15_uv - 3.0 * dx; + const float2 sample13_uv = sample15_uv - 2.0 * dx; + const float2 sample14_uv = sample15_uv - dx; + // Load each sample. Sample N must be read at sampleN_uv, because its + // Gaussian weight wN is computed from that same coordinate: + float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb; + float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb; + float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb; + float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb; + float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb; + float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + float3 sample8 = 
tex2D_linearize(tex, sample8_uv).rgb; + float3 sample9 = tex2D_linearize(tex, sample9_uv).rgb; + float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb; + float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb; + float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb; + float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb; + float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb; + float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb; + // Compute destination pixel offsets for each sample: + const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale; + const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel; + // Compute Gaussian sample weights: + const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv); + const float w1 = 
exp(-LENGTH_SQ(sample1_offset) * denom_inv); + const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv); + const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv); + const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv); + const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv); + const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv); + const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv); + const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv); + const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv); + const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv); + const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv); + const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv); + const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv); + const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv); + const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv); + const float weight_sum_inv = 1.0/( + w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + + w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15); + // Weight and sum the samples: + const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15; + return sum * weight_sum_inv; +} + +void main() { + // Would a viewport-relative size work better for this pass? (No.) + // PROS: + // 1.) Instead of writing an absolute size to user-cgp-constants.h, we'd + // write a viewport scale. That number could be used to directly scale + // the viewport-resolution bloom sigma and/or triad size to a smaller + // scale. This way, we could calculate an optimal dynamic sigma no + // matter how the dot pitch is specified. + // CONS: + // 1.) 
Texel smearing would be much worse at small viewport sizes, but + // performance would be much worse at large viewport sizes, so there + // would be no easy way to calculate a decent scale. + // 2.) Worse, we could no longer get away with using a constant-size blur! + // Instead, we'd have to face all the same difficulties as the real + // phosphor bloom, which requires static #ifdefs to decide the blur + // size based on the expected triad size...a dynamic value. + // 3.) Like the phosphor bloom, we'd have less control over making the blur + // size correct for an optical blur. That said, we likely overblur (to + // maintain brightness) more than the eye would do by itself: 20/20 + // human vision distinguishes ~1 arc minute, or 1/60 of a degree. The + // highest viewing angle recommendation I know of is THX's 40.04 degree + // recommendation, at which 20/20 vision can distinguish about 2402.4 + // lines. Assuming the "TV lines" definition, that means 1201.2 + // distinct light lines and 1201.2 distinct dark lines can be told + // apart, i.e. 1201.2 pairs of lines. This would correspond to 1201.2 + // pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total + // (if they're alternately lit). That's a max of 800.8 triads. Using + // a more popular 30 degree viewing angle recommendation, 20/20 vision + // can distinguish 1800 lines, or 600 triads of alternately lit + // phosphors. In contrast, we currently blur phosphors all the way + // down to 341.3 triads to ensure full brightness. + // 4.) Realistically speaking, we're usually just going to use bilinear + // filtering in this pass anyway, but it only works well to limit + // bandwidth if it's done at a small constant scale. 
+ + // Get the constants we need to sample: +// const sampler2D texture = ORIG_LINEARIZED.texture; +// const float2 tex_uv = tex_uv; +// const float2 blur_dxdy = blur_dxdy; + const float2 texture_size_ = ORIG_LINEARIZEDtexture_size; +// const float2 texture_size_inv = texture_size_inv; +// const float2 tex_uv_to_pixel_scale = tex_uv_to_pixel_scale; + float2 tex_uv_r, tex_uv_g, tex_uv_b; + + if(beam_misconvergence) + { + const float2 uv_scanline_step = uv_scanline_step; + const float2 convergence_offsets_r = get_convergence_offsets_r_vector(); + const float2 convergence_offsets_g = get_convergence_offsets_g_vector(); + const float2 convergence_offsets_b = get_convergence_offsets_b_vector(); + tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step; + tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step; + tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step; + } + // Get the blur sigma: + const float bloom_approx_sigma = get_bloom_approx_sigma(output_size.x, + estimated_viewport_size_x); + + // Sample the resized and blurred texture, and apply convergence offsets if + // necessary. Applying convergence offsets here triples our samples from + // 16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and + // HALATION_BLUR 3 times at full resolution every time they're used. + float3 color_r, color_g, color_b, color; + if(bloom_approx_filter > 1.5) + { + // Use a 4x4 Gaussian resize. This is slower but technically correct. 
+ if(beam_misconvergence) + { + color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r, + blur_dxdy, texture_size_, texture_size_inv, + tex_uv_to_pixel_scale, bloom_approx_sigma); + color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g, + blur_dxdy, texture_size_, texture_size_inv, + tex_uv_to_pixel_scale, bloom_approx_sigma); + color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b, + blur_dxdy, texture_size_, texture_size_inv, + tex_uv_to_pixel_scale, bloom_approx_sigma); + } + else + { + color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv, + blur_dxdy, texture_size_, texture_size_inv, + tex_uv_to_pixel_scale, bloom_approx_sigma); + } + } + else if(bloom_approx_filter > 0.5) + { + // Use a 3x3 resize blur. This is the softest option, because we're + // blurring already blurry bilinear samples. It doesn't play quite as + // nicely with convergence offsets, but it has its charms. + if(beam_misconvergence) + { + color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r, + blur_dxdy, bloom_approx_sigma); + color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g, + blur_dxdy, bloom_approx_sigma); + color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b, + blur_dxdy, bloom_approx_sigma); + } + else + { + color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy); + } + } + else + { + // Use bilinear sampling. This approximates a 4x4 Gaussian resize MUCH + // better than tex2Dblur3x3_resize for the very small sigmas we're + // likely to use at small output resolutions. (This estimate becomes + // too sharp above ~400x300, but the blurs break down above that + // resolution too, unless min_allowed_viewport_triads is high enough to + // keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.) 
+ if(beam_misconvergence) + { + color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb; + color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb; + color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb; + } + else + { + color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb; + } + } + // Pack the colors from the red/green/blue beams into a single vector: + if(beam_misconvergence) + { + color = float3(color_r.r, color_g.g, color_b.b); + } + // Encode and output the blurred image: + FragColor = encode_output(float4(tex2D_linearize(ORIG_LINEARIZED, tex_uv))); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/bloom-approx.vs b/shaders/CRT-Royale.shader/bloom-approx.vs new file mode 100644 index 00000000..e4faac1e --- /dev/null +++ b/shaders/CRT-Royale.shader/bloom-approx.vs @@ -0,0 +1,5859 @@ +#version 150 + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 tex_uv; + vec2 blur_dxdy; + vec2 uv_scanline_step; + float estimated_viewport_size_x; + vec2 texture_size_inv; + vec2 tex_uv_to_pixel_scale; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; +uniform int phase; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define 
mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +#define ORIG_LINEARIZEDvideo_size sourceSize[1].xy +#define ORIG_LINEARIZEDtexture_size sourceSize[1].xy + 
+/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. 
Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). 
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5, which is the value set here. NOTE(review): an earlier note said 0.5 looked unnecessarily dark and the value had been raised to 1.0, but the code still uses 0.5 -- confirm the intended value + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? 
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). 
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. 
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (a past note here said 0.5 was unnecessarily dark and 1.0 was used instead, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (a past note here said 0.5 was unnecessarily dark and 1.0 was used instead, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors settings which still need "cooking:" +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject 
to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. 
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. 
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. 
This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. 
LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif //
SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define 
DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. 
Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (try 1.0 if this looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x); + // Most antialiasing filters have a base radius of 4.0 pixels; pad it by the subpixel offset's magnitude (the r offset may be negative, and the blue offset is its negation): + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here.
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said this was raised to 1.0 because 0.5 looked too dark, but the value set here is 0.5 -- confirm which is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRTs), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): a previous note said this was raised to 1.0 because 0.5 looked too dark, but the value is still 0.5 -- confirm which is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial) + // for(int i = 1; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + i) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + // If several SIMULATE_* macros are defined, only the first one tested below + // takes effect (order: CRT_ON_LCD, GBA_ON_LCD, LCD_ON_CRT, GBA_ON_CRT). With + // none defined, input and output gamma both fall back to ntsc_gamma. + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +// encode_output(): Gamma-encode a color for this pass's output. +// If gamma_encode_output is true, color.rgb is raised to +// 1.0/get_pass_output_gamma(); alpha is replaced by 1.0 when +// assume_opaque_alpha is true and passed through otherwise. +// If gamma_encode_output is false, color is returned unchanged. +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +// decode_input(): Linearize a sample read from this pass's input. +// If linearize_input is true, color.rgb is raised to get_pass_input_gamma(); +// alpha is replaced by 1.0 when assume_opaque_alpha is true and passed +// through otherwise. If linearize_input is false, color is returned +// unchanged. +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +// decode_gamma_input(): Raise color.rgb to the caller-supplied per-channel +// gamma. Unlike decode_input(), this does not consult linearize_input, so the +// pow() is always applied. Alpha is replaced by 1.0 when assume_opaque_alpha +// is true and passed through otherwise. +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +// Each active overload samples tex and passes the sample through +// decode_input(). The float3 overloads use only tex_coords.xy. The coordinate +// parameters are not const-qualified here, presumably because of the +// blur-offset issue described in the TODO/FIXME note before this section. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +// NOTE(review): the two texel_off overloads below forward texel_off as the +// third argument of textureLod(), i.e. as a LOD, rather than applying it as a +// texel offset as the parameter name suggests. Confirm this is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +// Both overloads sample at tex_coords.xy and pass the sample through +// decode_input(); tex_coords.zw are not read. The first overload always +// requests LOD 0.0. +// NOTE(review): the second overload forwards texel_off as the LOD argument of +// textureLod() rather than applying it as a texel offset. Confirm this is +// intended. +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian integral_contrib() for detailed comments! 
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), intermediate_gamma); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling. 
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +/////////////////////////////// END VERTEX INCLUDES ///////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + +void main() { + gl_Position = position; + vTexCoord = texCoord; + const float2 video_uv = vTexCoord * texture_size/video_size; + tex_uv = video_uv * ORIG_LINEARIZEDvideo_size / + ORIG_LINEARIZEDtexture_size; + // The last pass (vertical scanlines) had a viewport y scale, so we can + // use it to calculate a better runtime sigma: + estimated_viewport_size_x = + video_size.y * geom_aspect_ratio_x/geom_aspect_ratio_y; + + // Get the uv sample distance between output pixels. We're using a resize + // blur, so arbitrary upsizing will be acceptable if filter_linearN = + // "true," and arbitrary downsizing will be acceptable if mipmap_inputN = + // "true" too. The blur will be much more accurate if a true 4x4 Gaussian + // resize is used instead of tex2Dblur3x3_resize (which samples between + // texels even for upsizing). + const float2 dxdy_min_scale = ORIG_LINEARIZEDvideo_size/output_size; + const float2 texture_size_inv = float2(1.0, 1.0)/ORIG_LINEARIZEDtexture_size; + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // For upsizing, we'll snap to texels and sample the nearest 4. 
+ const float2 dxdy_scale = max(dxdy_min_scale, float2(1.0, 1.0)); + blur_dxdy = dxdy_scale * texture_size_inv; + } + else + { + const float2 dxdy_scale = dxdy_min_scale; + blur_dxdy = dxdy_scale * texture_size_inv; + } + // tex2Dresize_gaussian4x4 needs to know a bit more than the other filters: + tex_uv_to_pixel_scale = output_size * + ORIG_LINEARIZEDtexture_size / ORIG_LINEARIZEDvideo_size; + //texture_size_inv = texture_size_inv; + + // Detecting interlacing again here lets us apply convergence offsets in + // this pass. il_step_multiple contains the (texel, scanline) step + // multiple: 1 for progressive, 2 for interlaced. + const float2 orig_video_size = ORIG_LINEARIZEDvideo_size; + const float y_step = 1.0 + float(is_interlaced(orig_video_size.y)); + const float2 il_step_multiple = float2(1.0, y_step); + // Get the uv distance between (texels, same-field scanlines): + uv_scanline_step = il_step_multiple / ORIG_LINEARIZEDtexture_size; +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs new file mode 100644 index 00000000..7750152c --- /dev/null +++ b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs @@ -0,0 +1,7240 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +// Dunno why this stuff didn't want to function in the vertex, but whatever +in Vertex { + vec2 vTexCoord; +// vec2 video_uv; +// vec2 scanline_tex_uv; +// vec2 halation_tex_uv; +// vec2 brightpass_tex_uv; +// vec2 bloom_tex_uv; + vec2 bloom_dxdy; + float bloom_sigma_runtime; +}; + + vec2 video_uv = vTexCoord; + vec2 scanline_tex_uv = vTexCoord; + vec2 halation_tex_uv = vTexCoord; + vec2 brightpass_tex_uv = vTexCoord; + vec2 bloom_tex_uv = vTexCoord; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define 
geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define bloom_texture source[0] + +#define MASKED_SCANLINEStexture source[2] +#define MASKED_SCANLINEStexture_size sourceSize[2].xy +#define MASKED_SCANLINESvideo_size sourceSize[2].xy +#define HALATION_BLURtexture source[5] +#define HALATION_BLURtexture_size sourceSize[5].xy +#define HALATION_BLURvideo_size sourceSize[5].xy +#define BRIGHTPASStexture source[1] +#define BRIGHTPASStexture_size sourceSize[1].xy +#define BRIGHTPASSvideo_size sourceSize[1].xy + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT 
//////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. 
Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now).
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said 0.5 was unnecessarily dark and the value had been set to 1.0, but the constant is 0.5 -- confirm which is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel?
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now).
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRTs), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines.
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-params.h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; the default is 0.5 (raise it toward 1.0 if the result is unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; the default is 0.5 (raise it toward 1.0 if the result is unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset()
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + +/////////////////////////// BEGIN FRAGMENT-INCLUDES /////////////////////////// + +//#include "bloom-functions.h" + +//////////////////////////// BEGIN BLOOM-FUNCTIONS /////////////////////////// + +#ifndef BLOOM_FUNCTIONS_H +#define BLOOM_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese.
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These utility functions and constants help several passes determine the +// size and center texel weight of the phosphor bloom in a uniform manner. + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// We need to calculate the correct blur sigma using some .cgp constants: +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. 
Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords.
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) 
filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// output_size < video_size. +// 4.) output_size == video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (video_size/output_size)/texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(video_size/output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. 
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) 
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) 
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. 
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. 
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT 
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct (true only when linearize_input is false, i.e. this pass does not decode its input itself): +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING 
FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//NOTE: Replacing the lookup wrappers with the macro below was observed to fix the blurs being offset: +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: The cause is the 'const' qualifier in front of the wrappers' coords parameters, not the wrappers themselves. + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D (NOTE(review): the texel_off overloads pass texel_off to textureLod() as its LOD argument rather than applying it as a texel offset): +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod (NOTE(review): only tex_coords.xy is used; the LOD given to textureLod() is 0.0, or texel_off in the second overload, so tex_coords.zw are ignored): +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* NOTE: this marker does not terminate the block comment opened above "tex3D:"; that comment stays open until the terminator following the tex2Dfetch_linearize_gamma overloads, so everything in between (including both tex2D_linearize_gamma overloads) is commented out. + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "quad-pixel-communication.h" + +/////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION ////////////////////// + +#ifndef QUAD_PIXEL_COMMUNICATION_H +#define QUAD_PIXEL_COMMUNICATION_H + 
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// Given screen pixel numbers, derive a "quad vector" describing a fragment's +// position in its 2x2 pixel quad. Given that vector, obtain the values of any +// variable at neighboring fragments. +// Requires: Using this file in general requires: +// 1.) ddx() and ddy() are present in the current Cg profile. +// 2.) The GPU driver is using fine/high-quality derivatives. +// Functions will give incorrect results if this is not true, +// so a test function is included. + + +///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES //////////////////// + +float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Two measures of the current fragment's output pixel number + // in the range ([0, output_size.x), [0, output_size.y)): + // 1.) output_pixel_num_wrt_uvxy.xy increase with uv coords. + // 2.) output_pixel_num_wrt_uvxy.zw increase with screen xy. + // Returns: Two measures of the fragment's position in its 2x2 quad: + // 1.) The .xy components are its 2x2 placement with respect to + // uv direction (the origin (0, 0) is at the top-left): + // top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0) + // bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0) + // You need this to arrange/weight shared texture samples. + // 2.) The .zw components are its 2x2 placement with respect to + // screen xy direction (position); the origin varies. + // quad_gather needs this measure to work correctly. + // Note: quad_vector.zw = quad_vector.xy * float2( + // ddx(output_pixel_num_wrt_uvxy.x), + // ddy(output_pixel_num_wrt_uvxy.y)); + // Caveats: This function assumes the GPU driver always starts 2x2 pixel + // quads at even pixel numbers. This assumption can be wrong + // for odd output resolutions (nondeterministically so). 
+ float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; + float4 quad_vector = pixel_odd * 2.0 - float4(1.0); + return quad_vector; +} + +float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Same as get_quad_vector_naive() (see that first). + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + float4 quad_vector_guess = + get_quad_vector_naive(output_pixel_num_wrt_uvxy); + // If quad_vector_guess.zw doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + // (For whole pixel numbers each guess component is -1.0 or 1.0, so its + // derivative across the quad is presumably +/-2.0; the 0.5 factor + // rescales that to a +/-1.0 sign flip that is applied to the guess.) + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z), + ddy(quad_vector_guess.w)); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +float4 get_quad_vector(float2 output_pixel_num_wrt_uv) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) output_pixel_num_wrt_uv must increase with uv coords and + // measure the current fragment's output pixel number in: + // ([0, output_size.x), [0, output_size.y)) + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + // Caveats: This function requires less information than the version + // taking a float4, but it's potentially slower. + // Do screen coords increase with or against uv? Get the direction + // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
+ float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x), + ddy(output_pixel_num_wrt_uv.y)); + float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; + float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0; + float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; + // If quad_vector_screen_guess doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x), + ddy(quad_vector_screen_guess.y)); + float4 quad_vector_guess = float4( + quad_vector_uv_guess, quad_vector_screen_guess); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +void quad_gather(float4 quad_vector, float4 curr, + out float4 adjx, out float4 adjy, out float4 diag) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) The GPU driver is using fine/high-quality derivatives. + // 3.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 4.) curr is any vector you wish to get neighboring values of. + // Returns: Values of an input vector (curr) at neighboring fragments + // adjacent x, adjacent y, and diagonal (via out parameters). 
+ adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float3 curr, + out float3 adjx, out float3 adjy, out float3 diag) +{ + // Float3 version + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float2 curr, + out float2 adjx, out float2 adjy, out float2 diag) +{ + // Float2 version + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +float4 quad_gather(float4 quad_vector, float curr) +{ + // Float version: + // Returns: return.x == current + // return.y == adjacent x + // return.z == adjacent y + // return.w == diagonal + float4 all = float4(curr); + all.y = all.x - ddx(all.x) * quad_vector.z; + all.zw = all.xy - ddy(all.xy) * quad_vector.w; + return all; +} + +float4 quad_gather_sum(float4 quad_vector, float4 curr) +{ + // Requires: Same as quad_gather() + // Returns: Sum of an input vector (curr) at all fragments in a quad. + float4 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float3 quad_gather_sum(float4 quad_vector, float3 curr) +{ + // Float3 version: + float3 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float2 quad_gather_sum(float4 quad_vector, float2 curr) +{ + // Float2 version: + float2 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float quad_gather_sum(float4 quad_vector, float curr) +{ + // Float version: + float4 all_values = quad_gather(quad_vector, curr); + return (all_values.x + all_values.y + all_values.z + all_values.w); +} + +bool fine_derivatives_working(float4 quad_vector, float4 curr) +{ + // Requires: 1.) 
ddx() and ddy() are present in the current Cg profile. + // 2.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 3.) curr must be a test vector with non-constant derivatives + // (its value should change nonlinearly across fragments). + // Returns: true if fine/hybrid/high-quality derivatives are used, or + // false if coarse derivatives are used or inconclusive + // Usage: Test whether quad-pixel communication is working! + // Method: We can confirm fine derivatives are used if the following + // holds (ever, for any value at any fragment): + // (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy)) + // The more values we test (e.g. test a float4 two ways), the + // easier it is to demonstrate fine derivatives are working. + // TODO: Check for floating point exact comparison issues! + float4 ddx_curr = ddx(curr); + float4 ddy_curr = ddy(curr); + float4 adjx = curr - ddx_curr * quad_vector.z; + float4 adjy = curr - ddy_curr * quad_vector.w; + bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w)); + bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w)); + return any(bool2(ddy_different, ddx_different)); +} + +bool fine_derivatives_working_fast(float4 quad_vector, float curr) +{ + // Requires: Same as fine_derivatives_working() + // Returns: Same as fine_derivatives_working() + // Usage: This is faster than fine_derivatives_working() but more + // likely to return false negatives, so it's less useful for + // offline testing/debugging. It's also useless as the basis + // for dynamic runtime branching as of May 2014: Derivatives + // (and quad-pixel communication) are currently disallowed in + // branches. 
However, future GPU's may allow you to use them + // in dynamic branches if you promise the branch condition + // evaluates the same for every fragment in the quad (and/or if + // the driver enforces that promise by making a single fragment + // control branch decisions). If that ever happens, this + // version may become a more economical choice. + float ddx_curr = ddx(curr); + float ddy_curr = ddy(curr); + float adjx = curr - ddx_curr * quad_vector.z; + return (ddy_curr != ddy(adjx)); +} + +#endif // QUAD_PIXEL_COMMUNICATION_H + +//////////////////////// END QUAD-PIXEL-COMMUNICATION /////////////////////// + +//#include "special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings: + // erft() if ERF_FAST_APPROXIMATION is defined, else erf6(). + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0) * last_factorial) + // for(int i = 1; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + i) * last_factorial); + // } + // sum *= pow(z, s); + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + // For large z, normalize the upper incomplete gamma and subtract it from + // 1.0 to get the lower one; for small z, normalize the series directly: + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + // (Each bool mask is converted to a 0.0/1.0 weight, so every component + // keeps exactly one branch result. NOTE(review): an Inf/NaN produced by + // the unselected branch would presumably survive the multiply by 0.0 -- + // confirm callers keep s and z in a range where both branches are finite.) + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blur sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using a 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-5, ignoring constant factors (since we're normalizing). + // Each weight is exp(-d*d / (2.0*sigma*sigma)) for tap distance d. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights.
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Sum the weighted samples, then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using 5 taps at tex_uv offset by integer multiples of dxdy from -2 to 2. + // It may be mipmapped depending on settings and dxdy. + // First get the unnormalized Gaussian texel weights w_k = exp(-k*k/(2*sigma*sigma)) and the normalization factor (reciprocal of the total tap weight). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Sum the weighted samples, then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using 3 taps at tex_uv and at tex_uv offset by dxdy in either direction. + // It may be mipmapped depending on settings and dxdy. + // First get the unnormalized Gaussian texel weights w_k = exp(-k*k/(2*sigma*sigma)) and the normalization factor (reciprocal of the total tap weight). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Sum the weighted samples, then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Sum the weighted samples, then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Sum the weighted samples (the center tap is sampled exactly at tex_uv), then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Sum the weighted samples, then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs: the tap weight is the sum of its pair, and the ratio (outer weight / pair sum) is the fractional offset of the linear sample. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Sum the weighted samples (the center tap is sampled exactly at tex_uv), then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Both taps share weight w01 and twice w01 equals the full weight sum, so the normalized result is a plain average (weight_sum_inv is computed above but not used): + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the unnormalized Gaussian texel weights w_k = exp(-k*k/(2*sigma*sigma)); the normalization factor comes from a helper call rather than the exact sum (see below). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); // NOTE(review): helper defined outside this view; presumably approximates the exact sum commented out above - confirm + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Sum the 22 weighted linear samples (11 per side of tex_uv), then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + 
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the unnormalized Gaussian texel weights w_k = exp(-k*k/(2*sigma*sigma)); the normalization factor comes from a helper call rather than the exact sum (see below). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); // NOTE(review): helper defined outside this view; presumably approximates the exact sum commented out above - confirm + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Sum the 16 weighted linear samples (8 per side of tex_uv), then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const 
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the unnormalized Gaussian texel weights w_k = exp(-k*k/(2*sigma*sigma)); the normalization factor comes from a helper call rather than the exact sum (see below). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); // NOTE(review): helper defined outside this view; presumably approximates the exact sum commented out above - confirm + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Sum the weighted samples (the center tap is sampled exactly at tex_uv), then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the unnormalized Gaussian texel weights w_k = exp(-k*k/(2*sigma*sigma)); the normalization factor comes from a helper call rather than the exact sum (see below). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); // NOTE(review): helper defined outside this view; presumably approximates the exact sum commented out above - confirm + // Calculate combined weights and linear sample ratios between texel pairs: each tap weight is the sum of its pair, and each ratio (outer weight / pair sum) is the fractional offset of the linear sample. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Sum the weighted samples (the center tap is sampled exactly at tex_uv), then apply the normalization factor once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file 
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: one for the center (sample4), one shared by the four axis neighbors at offset (1, 0), and one shared by the four corners at offset (1, 1). NOTE(review): LENGTH_SQ is defined outside this view; presumably the squared vector length - confirm. + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples, then normalize by the total weight on return: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + 
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9fast() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. 
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights (the offsets of samples 4 and 5 are transposes of each other, so w5 reuses w4): + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * 
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, 
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9fast() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7fast(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * 
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights (texels 2d2 and 2d3 are equidistant from the center, so w2d2_3 counts twice): + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = 
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(video_size/output_size).y + // 6.) 
tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12 + // bilinear samples, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps. 
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = 
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + 
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample1*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +/////////////////////////////// BLOOM CONSTANTS ////////////////////////////// + +// Compute constants with manual inlines of the functions below: +static const float bloom_diff_thresh = 1.0/256.0; + + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +inline float get_absolute_scale_blur_sigma(const float thresh) +{ + // Requires: 1.) min_expected_triads must be a global float. The number + // of horizontal phosphor triads in the final image must be + // >= min_allowed_viewport_triads.x for realistic results. + // 2.) bloom_approx_scale_x must be a global float equal to the + // absolute horizontal scale of BLOOM_APPROX. + // 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x + // should be <= 1.1658025090 to keep the final result < + // 0.62666015625 (the largest sigma ensuring the largest + // unused texel weight stays < 1.0/256.0 for a 3x3 blur). + // 4.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). 
+ // Returns: Return the minimum Gaussian sigma that will blur the pass + // output as much as it would have taken to blur away + // bloom_approx_scale_x horizontal phosphor triads. + // Description: + // BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd + // use the same blur sigma as the actual phosphor bloom and scale it down + // to the current resolution with (bloom_approx_scale_x/viewport_size_x), but + // we don't know the viewport size in this pass. Instead, we'll blur as + // much as it would take to blur away min_allowed_viewport_triads.x. This + // will blur "more than necessary" if the user actually uses more triads, + // but that's not terrible either, because blurring a constant fraction of + // the viewport may better resemble a true optical bloom anyway (since the + // viewport will generally be about the same fraction of each player's + // field of view, regardless of screen size and resolution). + // Assume an extremely large viewport size for asymptotic results. + return bloom_approx_scale_x/max_viewport_size_x * + get_min_sigma_to_blur_triad( + max_viewport_size_x/min_allowed_viewport_triads.x, thresh); +} + +inline float get_center_weight(const float sigma) +{ + // Given a Gaussian blur sigma, get the blur weight for the center texel. 
+ #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return get_fast_gaussian_weight_sum_inv(sigma); + #else + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + // Note: If the implementation uses a smaller blur than the max allowed, + // the worst case scenario is that the center weight will be overestimated, + // so we'll put a bit more energy into the brightpass...no huge deal. 
+ // Then again, if the implementation uses a larger blur than the max + // "allowed" because of dynamic branching, the center weight could be + // underestimated, which is more of a problem...consider always using + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // 43x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + + w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + // 31x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + + w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + // 25x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + // 17x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + #else + // 9x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + const float center_weight = weight_sum_inv * weight_sum_inv; + return center_weight; + #endif +} + +inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // If sigma is static, we can safely branch and use the smallest blur + // that's big enough. Ignore #define hints, because we'll only use a + // large blur if we actually need it, and the branches cost nothing. 
+ #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #else + // It's still worth branching if the profile supports dynamic branches: + // It's much faster than using a hugely excessive blur, but each branch + // eats ~1% FPS. + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #endif + #endif + // Failed optimization notes: + // I originally created a same-size mipmapped 5-tap separable blur10 that + // could handle any sigma by reaching into lower mip levels. It was + // as fast as blur25fast for runtime sigmas and a tad faster than + // blur31fast for static sigmas, but mipmapping two viewport-size passes + // ate 10% of FPS across all codepaths, so it wasn't worth it. + #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + if(sigma <= blur9_std_dev) + { + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur17_std_dev) + { + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur25_std_dev) + { + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur31_std_dev) + { + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + } + else + { + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + } + #else + // If we can't afford to branch, we can only guess at what blur + // size we need. Therefore, use the largest blur allowed. 
+ #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + #else + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + #endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE +} + +inline float get_bloom_approx_sigma(const float output_size_x_runtime, + const float estimated_viewport_size_x) +{ + // Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x. + // This is included for dynamic codepaths just in case the + // following two globals are incorrect: + // 2.) bloom_approx_size_x_for_fake should == the same + // if PHOSPHOR_BLOOM_FAKE is #defined + // 3.) bloom_approx_size_x should == the same otherwise + // Returns: For gaussian4x4, return a dynamic small bloom sigma that's + // as close to optimal as possible given available information. + // For blur3x3, return a static small bloom sigma that + // works well for typical cases. Otherwise, we're using simple + // bilinear filtering, so use static calculations. + // Assume the default static value. This is a compromise that ensures + // typical triads are blurred, even if unusually large ones aren't. 
+ static const float mask_num_triads_static = + max(min_allowed_viewport_triads.x, mask_num_triads_desired_static); + const float mask_num_triads_from_size = + estimated_viewport_size_x/mask_triad_size_desired; + const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x, + lerp(mask_num_triads_from_size, mask_num_triads_desired, + mask_specify_num_triads)); + // Assume an extremely large viewport size for asymptotic results: + static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // Use the runtime num triads and output size: + const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_runtime; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_runtime/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // account for the Gaussian scanline sigma from the last pass too. + // The bloom will be too wide horizontally but tall enough vertically. + return length(float2(bloom_approx_sigma, beam_max_sigma)); + } + else // 3x3 blur resize (the bilinear resize doesn't need a sigma) + { + // We're either using blur3x3 or bilinear filtering. The biggest + // reason to choose blur3x3 is to avoid dynamic weights, so use a + // static calculation. 
+ #ifdef PHOSPHOR_BLOOM_FAKE + static const float output_size_x_static = + bloom_approx_size_x_for_fake; + #else + static const float output_size_x_static = bloom_approx_size_x; + #endif + static const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_static; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_static/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // try accounting for the Gaussian scanline sigma from the last pass + // too; use the static default value: + return length(float2(bloom_approx_sigma, beam_max_sigma_static)); + } +} + +inline float get_final_bloom_sigma(const float bloom_sigma_runtime) +{ + // Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's + // optimal for the [known] triad size. + // 2.) Call this from a fragment shader (not a vertex shader), + // or blurring with static sigmas won't be constant-folded. + // Returns: Return the optimistic static sigma if the triad size is + // known at compile time. Otherwise return the optimal runtime + // sigma (10% slower) or an implementation-specific compromise + // between an optimistic or pessimistic static sigma. + // Notes: Call this from the fragment shader, NOT the vertex shader, + // so static sigmas can be constant-folded! 
+ const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad( + mask_triad_size_desired_static, bloom_diff_thresh); + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return bloom_sigma_runtime; + #else + // Overblurring looks as bad as underblurring, so assume average-size + // triads, not worst-case huge triads: + return bloom_sigma_optimistic; + #endif +} + + +#endif // BLOOM_FUNCTIONS_H + +//////////////////////////// END BLOOM-FUNCTIONS /////////////////////////// + +/////////////////////////// END FRAGMENT-INCLUDES ////////////////////////// + +void main() { + // Blur the vertically blurred brightpass horizontally by 9/17/25/31/43x: + const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime); + const float3 blurred_brightpass = tex2DblurNfast(bloom_texture, + bloom_tex_uv, bloom_dxdy, bloom_sigma); + + // Sample the masked scanlines; the auto-dim factor is levels_autodim_temp: + const float3 intensity_dim = + tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb; + const float auto_dim_factor = levels_autodim_temp; + const float undim_factor = 1.0/auto_dim_factor; + + // Calculate the mask dimpass, add it to the blurred brightpass, and + // undim (from scanline auto-dim) and amplify (from mask dim) the result: + const float mask_amplify = get_mask_amplify(); + const float3 brightpass = tex2D_linearize(BRIGHTPASStexture, + brightpass_tex_uv).rgb; + const float3 dimpass = intensity_dim - brightpass; + const float3 phosphor_bloom = (dimpass + blurred_brightpass) * + mask_amplify * undim_factor * levels_contrast; + + // Sample the halation texture, and let some light bleed into refractive + // diffusion. Conceptually this occurs before the phosphor bloom, but + // adding it in earlier passes causes black crush in the diffusion colors. 
+ const float3 diffusion_color = levels_contrast * tex2D_linearize( + HALATION_BLURtexture, halation_tex_uv).rgb; + const float3 final_bloom = lerp(phosphor_bloom, + diffusion_color, diffusion_weight); + + // Encode and output the bloomed image: + FragColor = encode_output(float4(final_bloom, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs new file mode 100644 index 00000000..5d9ad005 --- /dev/null +++ b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs @@ -0,0 +1,6570 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +in vec4 position; +in vec2 texCoord; + +// The commented-out members below didn't want to function in the vertex stage, so they are disabled: +out Vertex { + vec2 vTexCoord; +// vec2 video_uv; +// vec2 scanline_tex_uv; +// vec2 halation_tex_uv; +// vec2 brightpass_tex_uv; +// vec2 bloom_tex_uv; + vec2 bloom_dxdy; + float bloom_sigma_runtime; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define
geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms (static/inline/const expand to nothing). NOTE(review): atan2(x,y) below expands to atan(y,x), i.e. it swaps its two arguments, while Cg/HLSL atan2 and GLSL atan both take (y, x) -- confirm the swap is intended +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define MASKED_SCANLINEStexture source[2] +#define MASKED_SCANLINEStexture_size sourceSize[2].xy +#define MASKED_SCANLINESvideo_size sourceSize[2].xy +#define HALATION_BLURtexture source[5] +#define HALATION_BLURtexture_size sourceSize[5].xy +#define HALATION_BLURvideo_size sourceSize[5].xy +#define BRIGHTPASStexture source[1] +#define BRIGHTPASStexture_size sourceSize[1].xy +#define BRIGHTPASSvideo_size sourceSize[1].xy + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H +
+///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. 
+// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? 
They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; the value here is 0.5. NOTE(review): a porting note claimed it was raised to 1.0 because 0.5 looked too dark -- confirm which value is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5, kept here (an earlier note said 1.0 was used because 0.5 looked unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-params.h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; // NOTE(review): the static default x offset is negative (-1.0/3.0), so it shrinks the border computed below; abs() may have been intended -- confirm + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() // packs the r/g/b x offsets into one float3 +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() // packs the r/g/b y offsets into one float3 +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : // mask_type 0: aperture grille + mask_type < 1.5 ? mask_slot_amplify : // mask_type 1: slot mask + mask_shadow_amplify; // otherwise: shadow mask +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); // mode 0 (manual resize) needs PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); // mode 0 (manual resize) needs PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +/////////////////////////////// VERTEX INCLUDES ////////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the 
Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+
+///////////////////////////////// DESCRIPTION ////////////////////////////////
+
+// This file provides gamma-aware tex*D*() and encode_output() functions.
+// Requires: Before #include-ing this file, the including file must #define
+//           the following macros when applicable and follow their rules:
+//           1.) #define FIRST_PASS if this is the first pass.
+//           2.) #define LAST_PASS if this is the last pass.
+//           3.) If sRGB is available, set srgb_framebufferN = "true" for
+//               every pass except the last in your .cgp preset.
+//           4.) If sRGB isn't available but you want gamma-correctness with
+//               no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//           5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//           6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//           7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//           8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//           If an option in [5, 8] is #defined in the first or last pass, it
+//           should be #defined for both. It shouldn't make a difference
+//           whether it's #defined for intermediate passes or not.
+// Optional: The including file (or an earlier included file) may optionally
+//           #define a number of macros indicating it will override certain
+//           static constants with either static or uniform constants. The
+//           macros and associated constants are as follows:
+//           1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//               static const float ntsc_gamma
+//               static const float pal_gamma
+//               static const float crt_reference_gamma_high
+//               static const float crt_reference_gamma_low
+//               static const float lcd_reference_gamma
+//               static const float crt_office_gamma
+//               static const float lcd_office_gamma
+//           2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//               static const float crt_gamma
+//               static const float gba_gamma
+//               static const float lcd_gamma
+//           3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//               static const float input_gamma
+//               static const float intermediate_gamma
+//               static const float output_gamma
+//               (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//           4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//               static const bool assume_opaque_alpha
+//           The gamma constant overrides must be used in every pass or none,
+//           and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//           OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+// Usage: After setting macros appropriately, ignore gamma correction and
+//        replace all tex*D*() calls with equivalent gamma-aware
+//        tex*D*_linearize calls, except:
+//        1.) When you read an LUT, use regular tex*D or a gamma-specified
+//            function, depending on its gamma encoding:
+//            tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//        2.) If you must read pass0's original input in a later pass, use
+//            tex2D_linearize_ntsc_gamma. If you want to read pass0's
+//            input with gamma-corrected bilinear filtering, consider
+//            creating a first linearizing pass and reading from the input
+//            of pass1 later.
+//        Then, return encode_output(color) from every fragment shader.
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. 
This greatly reduces the amount of code in each shader
+// pass that depends on the pass number in the .cgp preset or whether sRGB
+// FBO's are being used: You can trivially change the gamma behavior of your
+// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
+// code in your first, Nth, and last passes, you can even put it all in another
+// header file and #include it from skeleton .cg files that #define the
+// appropriate pass-specific settings.
+//
+// Rationale for Using Three Macros:
+// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+// a lower maintenance burden on each pass. At first glance it seems we could
+// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it
+// breaks down for more complex scenarios like CRT simulation, where the pass
+// number determines the gamma encoding of the input and output.
+
+
+/////////////////////////////// BASE CONSTANTS ///////////////////////////////
+
+// Set standard gamma constants, but allow users to override them:
+// (If OVERRIDE_STANDARD_GAMMA is defined, none of the constants in this
+// #ifndef are declared here, so the including file has to supply them.)
+#ifndef OVERRIDE_STANDARD_GAMMA
+    // Standard encoding gammas:
+    // ntsc_gamma is also what the default (non-overridden, non-simulating)
+    // input, intermediate, and output gamma getters below return.
+    static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8; // Never actually 2.8 in practice
+    // Typical device decoding gammas (only use for emulating devices):
+    // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    // gammas: The standards purposely undercorrected for an analog CRT's
+    // assumed 2.5 reference display gamma to maintain contrast in assumed
+    // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    // These unstated assumptions about display gamma and perceptual rendering
+    // intent caused a lot of confusion, and more modern CRT's seemed to target
+    // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
+    // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    // displays designed to view sRGB in bright environments. (Standards are
+    // also in flux again with BT.1886, but it's underspecified for displays.)
+    // crt_reference_gamma_high and lcd_office_gamma are the defaults returned
+    // by get_crt_gamma() and get_lcd_gamma() when OVERRIDE_DEVICE_GAMMA is
+    // not defined.
+    static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5; // To match CRT
+    static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2; // Approximates sRGB
+#endif // OVERRIDE_STANDARD_GAMMA
+
+// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+// but only if they're aware of it.
+// When this is true, encode_output(), decode_input(), and
+// decode_gamma_input() return an alpha of 1.0 instead of the input's alpha.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
+
+// gamma-management.h should be compatible with overriding gamma values with
+// runtime user parameters, but we can only define other global constants in
+// terms of static constants, not uniform user parameters. To get around this
+// limitation, we need to define derived constants using functions.
+
+// Device gammas. With OVERRIDE_DEVICE_GAMMA the including file promises to
+// define crt_gamma, gba_gamma, and lcd_gamma globally; otherwise fall back to
+// the built-in constants:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif // OVERRIDE_DEVICE_GAMMA
+
+// Decoding/encoding gammas for the first/last passes. OVERRIDE_FINAL_GAMMA
+// bypasses every SIMULATE_* macro; otherwise the first SIMULATE_* macro that
+// is defined, in the order tested below, decides the input/output gammas.
+#ifdef OVERRIDE_FINAL_GAMMA
+    // The including file promises to globally define these constants:
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#else
+    // If every pass is gamma-corrected, always use ntsc_gamma between passes
+    // so middle passes don't need to care whether anything is being simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else // Don't simulate anything:
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif
+#endif // OVERRIDE_FINAL_GAMMA
+
+// Decoding/encoding gammas for the current pass. linearize_input and
+// gamma_encode_output stay static constants: they aren't derived values, and
+// constants let the compiler eliminate the dead branches.
+// Input side: the first pass decodes with the input gamma; any other pass
+// decodes with the intermediate gamma only under GAMMA_ENCODE_EVERY_FBO.
+#if defined(FIRST_PASS)
+    static const bool linearize_input = true;
+    inline float get_pass_input_gamma() { return get_input_gamma(); }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    static const bool linearize_input = true;
+    inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+#else
+    static const bool linearize_input = false;
+    inline float get_pass_input_gamma() { return 1.0; }
+#endif
+// Output side: the last pass encodes with the output gamma; any other pass
+// encodes with the intermediate gamma only under GAMMA_ENCODE_EVERY_FBO.
+#if defined(LAST_PASS)
+    static const bool gamma_encode_output = true;
+    inline float get_pass_output_gamma() { return get_output_gamma(); }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    static const bool gamma_encode_output = true;
+    inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+#else
+    static const bool gamma_encode_output = false;
+    inline float get_pass_output_gamma() { return 1.0; }
+#endif
+
+// Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
+
+// Gamma-encodes color.rgb with 1.0/get_pass_output_gamma() when this pass
+// encodes its output; otherwise returns color untouched. Alpha is passed
+// through unless assume_opaque_alpha is set, in which case it becomes 1.0.
+inline float4 encode_output(const float4 color)
+{
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 encoded_rgb =
+        pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, alpha);
+}
+
+// Linearizes color.rgb with get_pass_input_gamma() when this pass linearizes
+// its input; otherwise returns color untouched. Alpha is handled the same way
+// as in encode_output().
+inline float4 decode_input(const float4 color)
+{
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+// Always linearizes color.rgb with the caller-supplied per-channel gamma,
+// regardless of the pass settings. Alpha is handled as in encode_output().
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+/////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
+
+// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a wide array of linearizing texture lookup wrapper functions. The
+// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+// lookups are provided for completeness in case that changes someday. Nobody
+// is likely to use the *fetch and *proj functions, but they're included just
+// in case. The only tex*D texture sampling functions omitted are:
+// - tex*Dcmpbias
+// - tex*Dcmplod
+// - tex*DARRAY*
+// - tex*DMS*
+// - Variants returning integers
+// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{ return decode_input(tex1Dlod(tex, tex_coords)); }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
+
+// tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+*/
+// tex2D:
+// Sample tex and run the result through decode_input(). The float3 overloads
+// only use tex_coords.xy. The coordinate parameters are not const-qualified
+// here; see the TODO/FIXME note before the TEXTURE LOOKUP WRAPPERS banner,
+// which attributes offset blurs to a 'const' on the coords.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); }
+
+// NOTE(review): in the two overloads below, texel_off is forwarded as the
+// third argument of textureLod(), which is the level of detail rather than a
+// texel offset. Confirm whether any caller relies on these overloads.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords, texel_off)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{ return decode_input(tex2Dbias(tex, tex_coords)); }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
+
+// tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{ return decode_input(tex2Dfetch(tex, tex_coords)); }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
+
+// tex2Dlod:
+// Sample tex at tex_coords.xy with an explicit level of detail and run the
+// result through decode_input(). The first overload always uses LOD 0.0;
+// tex_coords.zw are not read by either overload.
+// NOTE(review): the second overload passes texel_off as the LOD argument of
+// textureLod(), as in the tex2D_linearize texel_off overloads.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+/*
+// tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+*/
+/*
+// tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful:
+
+// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
+
+
+// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a narrower selection of tex2D* wrapper functions that decode an
+// input sample with a specified gamma value. These are useful for reading
+// LUT's and for reading the input of pass0 in a later pass.
+
+// tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+/*
+// tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
+
+// tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
+*/
+// NOTE(review): the block comment opened before the tex3D wrappers does not
+// seem to be terminated until the comment terminator directly above this
+// note, so the two tex2D_linearize_gamma overloads earlier in this section
+// appear to be compiled out. Confirm before relying on them.
+// tex2Dlod:
+// Sample tex at tex_coords.xy with an explicit level of detail and decode the
+// result with the caller-supplied gamma via decode_gamma_input(). The first
+// overload always uses LOD 0.0; tex_coords.zw are not read by either one.
+// NOTE(review): the second overload passes texel_off as the LOD argument of
+// textureLod() rather than as a texel offset.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); }
+
+
+#endif // GAMMA_MANAGEMENT_H
+
+//////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
+
+// crt-royale: A full-featured CRT shader, with cheese.
+// Copyright (C) 2014 TroggleMonkey
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+////////////////////////////////// INCLUDES //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+///////////////////////////// CODEPATH SELECTION /////////////////////////////
+
+// Choose a looping strategy based on what's allowed:
+// Dynamic loops not allowed: Use a flat static loop.
+// Dynamic loops accommodated: Coarsely branch around static loops.
+// Dynamic loops assumed allowed: Use a flat dynamic loop.
+// Resulting macros:
+//   DRIVERS_ALLOW_DYNAMIC_BRANCHES defined: neither macro below is defined.
+//   Otherwise, ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS defined:
+//       BREAK_LOOPS_INTO_PIECES is defined.
+//   Otherwise: USE_SINGLE_STATIC_LOOP is defined.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif // No else needed: Dynamic loops assumed.
+
+
+////////////////////////////////// CONSTANTS /////////////////////////////////
+
+// The larger the resized tile, the fewer samples we'll need for downsizing.
+// See if we can get a static min tile size > mask_min_allowed_tile_size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size =
+    mask_min_allowed_tile_size;
+// Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+// Vectorized loops sample in multiples of 4. Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    // Requires: The following global constants must be defined:
+    //           1.) mask_sinc_lobes
+    //           2.) max_sinc_resize_samples_m4
+    // Returns: The minimum number of texture samples for a correct downsize
+    //          at magnification_scale, rounded up to a multiple of 4 and
+    //          capped at the static maximum computed above.
+    // We're downsizing, so the filter is sized across 2*lobes output pixels
+    // (not 2*lobes input texels). This impacts distance measurements and the
+    // minimum number of input samples needed.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else // ifdef BREAK_LOOPS_INTO_PIECES
+        // Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    // Requires: 1.) dr == du == 1.0/texture_size.x or
+    //               dr == dv == 1.0/texture_size.y
+    //               (whichever direction we're resampling in).
+    //               It's a scalar to save register space.
+    //           2.) input_tiles_per_texture_r is the number of input tiles
+    //               that can fit in the input texture in the direction we're
+    //               resampling this pass.
+    //           3.) vertical indicates whether we're resampling vertically
+    //               this pass (or horizontally).
+    // Returns: Pack and return the first sample's tile_uv coord in [0, 1]
+    //          and its texel distance from the destination pixel, in the
+    //          resized dimension only.
+    // We'll start with the topmost or leftmost sample and work down or right,
+    // so get the first sample location and distance. Modify both dimensions
+    // as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    // (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    // Project wrapped coordinates to the [0, 1] range. We'll do this with all
+    // samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    // Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    // Samples tex at tex_uv, trying to force the lowest mip level when one of
+    // the ANISOTROPIC_RESAMPLING_COMPAT_* macros asks for it.
+    // Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    // One [slow] workaround is to select the lowest mip level:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        // textureLod() needs an explicit LOD argument; the previous
+        // two-argument call could not compile. Pin the lookup to mip level 0.
+        return textureLod(tex, tex_uv, 0.0);
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            // NOTE(review): tex2Dbias is a Cg intrinsic and no definition of
+            // it is visible in this part of the file; confirm this path
+            // compiles before enabling the macro.
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+////////////////////////////// LOOP BODY MACROS //////////////////////////////
+
+// Using inline functions can exceed the temporary register limit, so we're
+// stuck with #define macros (I'm TRULY sorry). They're declared here instead
+// of above to be closer to the actual invocation sites. Steps:
+// 1.) Get the exact texel location.
+// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+// 3.) Get the distance from the current pixel and sinc weight:
+//         sinc(dist) = sin(pi * dist)/(pi * dist)
+//     We can also use the slower/smoother Lanczos instead:
+//         L(x) = sinc(dist) * sinc(dist / lobes)
+// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//     in pixel_color (we'll normalize outside the loop at the end).
+// We vectorize the loop to help reduce the Lanczos window's cost.
+
+    // The r coord is the coord in the dimension we're resizing along (u or v),
+    // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
+    // for four new texel samples.
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Pass it through to get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so are max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. 
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonically opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture. 
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. 
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; value in effect is 0.5. NOTE(review): an earlier note here said it had been raised to 1.0 because 0.5 looked too dark -- confirm which is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities.
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise toward 1.0 if that looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0.0) * last_factorial) + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0;
+    sum -= z/denom1;
+    sum += z_sq/denom2;
+    sum -= z * z_sq/denom3;
+    return scale * sum;
+}
+
+// Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    // Requires: 1.) s < ~0.5
+    //           2.) z > ~0.775075
+    // Returns:  Gauss's continued fraction representation for the upper
+    //           incomplete gamma function (4 terms).
+    // The "rolled up" continued fraction looks like this. The denominator
+    // is truncated, and it's calculated "from the bottom up:"
+    //     denom = float4('inf');
+    //     float4 one = float4(1.0);
+    //     for(int i = 4; i > 0; --i)
+    //     {
+    //         denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //     }
+    // Unrolled and constant-unfolded for madds and parallelism:
+    const float4 numerator = pow(z, s) * exp(-z);
+    // i = 4: the (i * (s - i))/denom term vanishes because denom starts at inf.
+    float4 denom = float4(7.0) + z - s;
+    // i = 3: 3*(s - 3) = 3*s - 9
+    denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom;
+    // i = 2: 2*(s - 2) = 2*s - 4
+    denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom;
+    // i = 1: 1*(s - 1) = s - 1
+    denom = float4(1.0) + z - s + (s - float4(1.0))/denom;
+    return numerator / denom;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    // Float3 version (same four unrolled terms as the float4 overload):
+    const float3 numerator = pow(z, s) * exp(-z);
+    float3 denom = float3(7.0) + z - s;
+    denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom;
+    denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom;
+    denom = float3(1.0) + z - s + (s - float3(1.0))/denom;
+    return numerator / denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    // Float2 version (same four unrolled terms as the float4 overload):
+    const float2 numerator = pow(z, s) * exp(-z);
+    float2 denom = float2(7.0) + z - s;
+    denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom;
+    denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom;
+    denom = float2(1.0) + z - s + (s - float2(1.0))/denom;
+    return numerator / denom;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    // Float version (same four unrolled terms as the float4 overload):
+    const float numerator = pow(z, s) * exp(-z);
+    float denom = 7.0 + z - s;
+    denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
+    denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
+    denom = 1.0 + z - s + (s - 1.0)/denom;
+    return numerator / denom;
+}
+
+// Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    // Requires: 1.) s < ~0.5
+    //           2.) s_inv = 1/s (precomputed for outside reuse)
+    //           3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    // Returns:  Approximate the normalized lower incomplete gamma function
+    //           for s < 0.5. Since we only care about s < 0.5, we only need
+    //           to evaluate two branches (not four) based on z. Each branch
+    //           uses four terms, with a max relative error of ~0.00182. The
+    //           branch threshold and specifics were adapted for fewer terms
+    //           from Gil/Segura/Temme's paper here:
+    //           http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    // Evaluate both branches: Real branches test slower even when available.
+    static const float4 thresh = float4(0.775075);
+    // Per-component comparison against the branch threshold:
+    bool4 z_is_large;
+    z_is_large.x = z.x > thresh.x;
+    z_is_large.y = z.y > thresh.y;
+    z_is_large.z = z.z > thresh.z;
+    z_is_large.w = z.w > thresh.w;
+    // Large-z branch: 1 - (upper incomplete gamma)/gamma(s).
+    const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    // Small-z branch: (lower incomplete gamma series)/gamma(s).
+    const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    // Combine the results from both branches:
+    // The bool vectors are converted to float and used as multiplicative
+    // masks (presumably 1.0/0.0 per component), so each component keeps
+    // exactly one branch.
+    // NOTE(review): because this is a multiply-add rather than a select, a
+    // non-finite value in the unselected branch may not be discarded by the
+    // 0.0 weight - confirm the input ranges keep both branches finite.
+    bool4 inverse_z_is_large = not(z_is_large);
+    return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    // Float3 version (see the float4 overload for requirements and details):
+    static const float3 thresh = float3(0.775075);
+    bool3 z_is_large;
+    z_is_large.x = z.x > thresh.x;
+    z_is_large.y = z.y > thresh.y;
+    z_is_large.z = z.z > thresh.z;
+    const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool3 inverse_z_is_large = not(z_is_large);
+    return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    // Float2 version (see the float4 overload for requirements and details):
+    static const float2 thresh = float2(0.775075);
+    bool2 z_is_large;
+    z_is_large.x = z.x > thresh.x;
+    z_is_large.y = z.y > thresh.y;
+    const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool2 inverse_z_is_large = not(z_is_large);
+    return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    // Float version (see the float4 overload for requirements and details):
+    static const float thresh = 0.775075;
+    const bool z_is_large = z > thresh;
+    const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    // Same mask-and-add combination as the vector overloads:
+    return large_z * float(z_is_large) + small_z * float(!z_is_large);
+}
+
+// Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    // Requires: s < ~0.5
+    // Returns:  Approximate the normalized lower incomplete gamma function
+    //           for s < 0.5. See normalized_ligamma_impl() for details.
+    // Convenience entry point: computes 1/s and 1/gamma(s) itself (via
+    // gamma_impl(), defined outside this section) and forwards to the
+    // implementation function.
+    const float4 s_inv = float4(1.0)/s;
+    const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv);
+    return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    // Float3 version:
+    const float3 s_inv = float3(1.0)/s;
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv);
+    return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    // Float2 version:
+    const float2 s_inv = float2(1.0)/s;
+    const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv);
+    return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    // Float version:
+    const float s_inv = 1.0/s;
+    const float gamma_s_inv = 1.0/gamma_impl(s, s_inv);
+    return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+#endif // SPECIAL_FUNCTIONS_H
+
+//////////////////////////// END SPECIAL-FUNCTIONS ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+//////////////////////////// BEGIN GAMMA-MANAGEMENT //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+///////////////////////////////// MIT LICENSE ////////////////////////////////
+
+// Copyright (C) 2014 TroggleMonkey
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+
+///////////////////////////////// DESCRIPTION ////////////////////////////////
+
+// This file provides gamma-aware tex*D*() and encode_output() functions.
+// Requires: Before #include-ing this file, the including file must #define
+//           the following macros when applicable and follow their rules:
+//           1.) #define FIRST_PASS if this is the first pass.
+//           2.) #define LAST_PASS if this is the last pass.
+//           3.) If sRGB is available, set srgb_framebufferN = "true" for
+//               every pass except the last in your .cgp preset.
+//           4.) If sRGB isn't available but you want gamma-correctness with
+//               no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//           5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//           6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//           7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//           8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//           If an option in [5, 8] is #defined in the first or last pass, it
+//           should be #defined for both. It shouldn't make a difference
+//           whether it's #defined for intermediate passes or not.
+// Optional: The including file (or an earlier included file) may optionally
+//           #define a number of macros indicating it will override certain
+//           static constants with either static or uniform constants. The
+//           macros and associated constants are as follows:
+//           1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+//
+// Detailed Policy:
+// tex*D*_linearize() functions enforce a consistent gamma-management policy
+// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume
+// their input texture has the same encoding characteristics as the input for
+// the current pass (which doesn't apply to the exceptions listed above).
+// Similarly, encode_output() enforces a policy based on the LAST_PASS and
+// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the
+// following two pipelines.
+// Typical pipeline with intermediate sRGB framebuffers:
+//     linear_color = pow(pass0_encoded_color, input_gamma);
+//     intermediate_output = linear_color; // Automatic sRGB encoding
+//     linear_color = intermediate_output; // Automatic sRGB decoding
+//     final_output = pow(linear_color, 1.0/output_gamma);
+// Typical pipeline without intermediate sRGB framebuffers:
+//     linear_color = pow(pass0_encoded_color, input_gamma);
+//     intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//     linear_color = pow(intermediate_output, intermediate_gamma);
+//     final_output = pow(linear_color, 1.0/output_gamma);
+// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+// easily get gamma-correctness without banding on devices where sRGB isn't
+// supported.
+//
+// Use This Header to Maximize Code Reuse:
+// The purpose of this header is to provide a consistent interface for texture
+// reads and output gamma-encoding that localizes and abstracts away all the
+// annoying details. This greatly reduces the amount of code in each shader
+// pass that depends on the pass number in the .cgp preset or whether sRGB
+// FBO's are being used: You can trivially change the gamma behavior of your
+// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same
+// code in your first, Nth, and last passes, you can even put it all in another
+// header file and #include it from skeleton .cg files that #define the
+// appropriate pass-specific settings.
+//
+// Rationale for Using Three Macros:
+// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+// a lower maintenance burden on each pass. At first glance it seems we could
+// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it
+// breaks down for more complex scenarios like CRT simulation, where the pass
+// number determines the gamma encoding of the input and output.
+
+
+/////////////////////////////// BASE CONSTANTS ///////////////////////////////
+
+// Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    // Standard encoding gammas:
+    static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8; // Never actually 2.8 in practice
+    // Typical device decoding gammas (only use for emulating devices):
+    // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    // gammas: The standards purposely undercorrected for an analog CRT's
+    // assumed 2.5 reference display gamma to maintain contrast in assumed
+    // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    // These unstated assumptions about display gamma and perceptual rendering
+    // intent caused a lot of confusion, and more modern CRT's seemed to target
+    // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
+    // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    // displays designed to view sRGB in bright environments. (Standards are
+    // also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5; // To match CRT
+    static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2; // Approximates sRGB
+#endif // OVERRIDE_STANDARD_GAMMA
+
+// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+// but only if they're aware of it.
+// When true, the encode/decode helpers below write 1.0 to the alpha channel
+// instead of passing the input alpha through.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
+
+// gamma-management.h should be compatible with overriding gamma values with
+// runtime user parameters, but we can only define other global constants in
+// terms of static constants, not uniform user parameters. To get around this
+// limitation, we need to define derived constants using functions.
+
+// Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    // The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#else
+    // Defaults: high CRT reference gamma, a fixed GBA gamma, office LCD gamma.
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#endif // OVERRIDE_DEVICE_GAMMA
+
+// Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    // The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#else
+    // If we gamma-correct every pass, always use ntsc_gamma between passes to
+    // ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+    // The nested #ifdef chain below makes the first SIMULATE_* macro that is
+    // defined win, in this order: CRT_ON_LCD, GBA_ON_LCD, LCD_ON_CRT,
+    // GBA_ON_CRT. Input gamma is the simulated source device; output gamma is
+    // the display device.
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else // Don't simulate anything:
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif // SIMULATE_GBA_ON_CRT
+    #endif // SIMULATE_LCD_ON_CRT
+    #endif // SIMULATE_GBA_ON_LCD
+    #endif // SIMULATE_CRT_ON_LCD
+#endif // OVERRIDE_FINAL_GAMMA
+
+// Set decoding/encoding gammas for the current pass. Use static constants for
+// linearize_input and gamma_encode_output, because they aren't derived, and
+// they let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    // Every pass decodes its input and re-encodes its output. Only the
+    // gamma values differ between first/middle/last passes:
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif
+#else
+    // Default pipeline: only the first pass decodes and only the last pass
+    // encodes. Every other read/write is passed through (gamma 1.0):
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif
+#endif
+
+// Lets callers branch statically on whether bilinear filtering of this
+// pass's input is gamma-correct (true exactly when no decode is needed):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
+
+// Gamma-encode a linear color for output when this pass requires it.
+// RGB is raised to 1.0/get_pass_output_gamma(). Alpha is forced to 1.0 when
+// assume_opaque_alpha is set and passed through otherwise. When
+// gamma_encode_output is false the color is returned untouched.
+inline float4 encode_output(const float4 color)
+{
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        alpha = 1.0;
+    }
+    return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), alpha);
+}
+
+// Linearize a sampled color when this pass requires it.
+// RGB is raised to get_pass_input_gamma(). Alpha is handled as in
+// encode_output(). When linearize_input is false the color is returned
+// untouched.
+inline float4 decode_input(const float4 color)
+{
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        alpha = 1.0;
+    }
+    return float4(pow(color.rgb, float3(get_pass_input_gamma())), alpha);
+}
+
+// Linearize a sampled color with a caller-supplied per-channel gamma.
+// Unlike decode_input(), this always decodes, regardless of linearize_input.
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        alpha = 1.0;
+    }
+    return float4(pow(color.rgb, gamma), alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+/////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
+
+// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a wide array of linearizing texture lookup wrapper functions. The
+// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+// lookups are provided for completeness in case that changes someday. Nobody
+// is likely to use the *fetch and *proj functions, but they're included just
+// in case. The only tex*D texture sampling functions omitted are:
+//     - tex*Dcmpbias
+//     - tex*Dcmplod
+//     - tex*DARRAY*
+//     - tex*DMS*
+//     - Variants returning integers
+// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{ return decode_input(tex1Dlod(tex, tex_coords)); }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
+
+// tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+*/
+// tex2D:
+// Sample tex at tex_coords through COMPAT_TEXTURE and linearize the result
+// with decode_input(). The coordinate parameters are deliberately not
+// declared const (see the TODO/EDIT note above the wrapper section).
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); }
+
+// float3 overload: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); }
+
+// NOTE(review): in the two overloads below, texel_off is forwarded as the
+// third argument of textureLod(), which is the level-of-detail parameter,
+// not a texel offset - confirm whether any caller relies on this.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords, texel_off)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{ return decode_input(tex2Dbias(tex, tex_coords)); }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
+
+// tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{ return decode_input(tex2Dfetch(tex, tex_coords)); }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
+
+// tex2Dlod:
+// Samples at tex_coords.xy with an explicit level of detail of 0.0.
+// NOTE(review): tex_coords.zw are ignored here, so a LOD supplied in .w
+// (the Cg tex2Dlod convention) has no effect - confirm this is intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); }
+
+// NOTE(review): as with tex2D_linearize above, texel_off is passed in the
+// level-of-detail position of textureLod().
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+/*
+// tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+*/
+/*
+// tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords)
+{ return decode_input(tex3D(tex, tex_coords)); }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex3D(tex, tex_coords, texel_off)); }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{ return decode_input(tex3D(tex, tex_coords, dx, dy)); }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{ return decode_input(tex3Dbias(tex, tex_coords)); }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
+
+// tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{ return decode_input(tex3Dfetch(tex, tex_coords)); }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
+
+// tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{ return decode_input(tex3Dlod(tex, tex_coords)); }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
+
+// tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{ return decode_input(tex3Dproj(tex, tex_coords)); }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
+*/
+// The disabled tex3D block above is closed with a real comment terminator.
+// The previous "/////////*" marker did not end the block comment, so the
+// comment ran on to the terminator after the tex2Dbias/tex2Dfetch gamma
+// block and silently disabled both tex2D_linearize_gamma() overloads.
+
+// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// This narrow selection of nonstandard tex2D* functions can be useful:
+
+// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
+
+
+// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a narrower selection of tex2D* wrapper functions that decode an
+// input sample with a specified gamma value. These are useful for reading
+// LUT's and for reading the input of pass0 in a later pass.
+
+// tex2D:
+// Sample tex at tex_coords through COMPAT_TEXTURE and linearize the result
+// with the caller-supplied per-channel gamma (see decode_gamma_input()).
+// The coordinate and gamma parameters are not declared const, matching the
+// other enabled wrappers in this file (see the TODO/EDIT note above the
+// wrapper section about const coordinates).
+inline float4 tex2D_linearize_gamma(const sampler2D tex, float2 tex_coords, float3 gamma)
+{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
+
+// float3 overload: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize_gamma(const sampler2D tex, float3 tex_coords, float3 gamma)
+{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+/*
+// tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
+
+// tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
+*/
+// tex2Dlod:
+// Samples at tex_coords.xy with an explicit level of detail of 0.0, then
+// linearizes with the caller-supplied gamma.
+// NOTE(review): tex_coords.zw are ignored, so a LOD supplied in .w has no
+// effect - confirm this is intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); }
+
+// NOTE(review): texel_off is passed in the level-of-detail position of
+// textureLod(), not as a texel offset - confirm whether callers rely on it.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); }
+
+
+#endif // GAMMA_MANAGEMENT_H
+
+//////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
+
+//////////////////////////////// END INCLUDES ////////////////////////////////
+
+///////////////////////////// SCANLINE FUNCTIONS /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the beam_max_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.)
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // betas widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + // Note: The result is a float3 holding one beta per color channel; the + // scalar beam_min_shape and shape_range are applied to every channel. + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness.
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 * (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + // Half the erf difference is the Gaussian's integral across the pixel's + // vertical extent; dividing by pixel_height turns it into an average: + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta.
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models the standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + // dist1/dist0 are the signed distances at the two pixel edges; sign() + // restores the sign that abs() strips before the ligamma evaluation: + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian_integral_contrib() for detailed comments!
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: inner_denom_inv is 1/(2*sigma**2) and + // outer_denom_inv is 1/(sigma*sqrt(2*pi)), as in the formula above: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + // beam_antialias_level <= 0.5: use a single sample at dist. + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides; scale is color times the normalization factor + // beta/(2*alpha*gamma(1/beta)) from the formula above: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too.
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + // beam_antialias_level <= 0.5: use a single sample at dist. + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. + // Codepath: beam_generalized_gaussian selects the distribution; a + // beam_antialias_level > 1.5 selects the integral variants, + // and anything lower selects the sampled variants.
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow).
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static (or dynamic branches are + // allowed), so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + //
Inputs: color0-3 are colors in gamma-encoded RGB. + // pow() needs both operands to have the same type, so the scalar + // exponent is broadcast to a float3 (as the other paths already do): + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + // NOTE(review): gamma_mixed_color below is blended in gamma space but + // is not raised back to intermediate_gamma afterwards -- confirm that + // this is intended given the "linear RGB" return contract. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // NOTE(review): leftover commented-out override from the port; its + // purpose is unclear -- confirm before removing: +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling.
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // Requires (inferred from the code below -- TODO confirm with callers): + // 1.) tex_uv * tex_size yields texel coordinates. + // 2.) texel coordinates * texture_size_inv yields uv + // coordinates (presumably texture_size_inv is 1.0/tex_size). + // Returns: The horizontally filtered color from get_scanline_color(), + // using normalized Quilez, Gaussian, or Lanczos2 weights chosen + // by beam_horiz_filter (< 0.5, < 1.5, or anything higher). + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez (quintic smoothstep between the two nearest texels only): + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // Requires: Same as sample_single_scanline_horizontal(). + // Returns: If beam_misconvergence is set, R, G, and B each come from a + // separate lookup at tex_uv minus that channel's horizontal + // convergence offset (scaled by texture_size_inv.x); otherwise + // a single shared lookup at tex_uv is returned. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames.
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // NOTE(review): leftover commented-out override from the port; its + // purpose is unclear -- confirm before removing: +// const float interlace_bff1 = 1.0; + // floor(il_step_multiple.y * 0.75) is 0 for a step of 1.0 and 1 for 2.0, + // so the field offset only applies when stepping over alternate lines: + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + // Returns: true when interlace_detect is set and num_lines lies in + // (288.5, 576.5), or in (1079.5, 1080.5) with interlace_1080i + // enabled; false otherwise. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit.
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +////////////////////////////// END VERTEX-INCLUDES ////////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +// NOTE(review): this divides a width (targetSize.x) by a height +// (sourceSize[0].y) -- confirm that mixing the axes is intended. +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + +// copied from bloom-functions.h +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +// Vertex entry point: forwards the position, sets the texture coords, and +// computes bloom_dxdy and bloom_sigma_runtime. +void main() { + gl_Position = position; + vTexCoord = texCoord * 1.0001; + // NOTE(review): this declares a local tex_uv. If this shader's vertex + // output block also declares tex_uv, the local shadows it and that output + // is never written here -- confirm. + float2 tex_uv = vTexCoord.xy; + +// These things keep causing weird behavior and they're not needed except for NPOT, so...
+/* // Our various input textures use different coords: + const float2 video_uv = tex_uv;// * texture_size/video_size; + video_uv = video_uv; + scanline_tex_uv = video_uv;// * MASKED_SCANLINESvideo_size / + MASKED_SCANLINEStexture_size; + halation_tex_uv = video_uv;// * HALATION_BLURvideo_size / + HALATION_BLURtexture_size; + brightpass_tex_uv = video_uv;// * BRIGHTPASSvideo_size / + BRIGHTPASStexture_size; + bloom_tex_uv = tex_uv; +*/ + // We're horizontally blurring the bloom input (vertically blurred + // brightpass). Get the uv distance between output pixels / input texels + // in the horizontal direction (this pass must NOT resize): + bloom_dxdy = float2(1.0/texture_size.x, 0.0); + + // Calculate a runtime bloom_sigma in case it's needed: + const float mask_tile_size_x = get_resized_mask_tile_size( + output_size, output_size * mask_resize_viewport_scale, false).x; + bloom_sigma_runtime = get_min_sigma_to_blur_triad( + mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/bloom-vertical.fs b/shaders/CRT-Royale.shader/bloom-vertical.fs new file mode 100644 index 00000000..4c37eee1 --- /dev/null +++ b/shaders/CRT-Royale.shader/bloom-vertical.fs @@ -0,0 +1,4824 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details.
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +in Vertex { + vec2 vTexCoord; + vec2 tex_uv; + vec2 bloom_dxdy; + float bloom_sigma_runtime; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 
+#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define MASKED_SCANLINEStexture source[0] +#define MASKED_SCANLINEStexture_size sourceSize[0].xy +#define MASKED_SCANLINESvideo_size sourceSize[0].xy +#define BLOOM_APPROXtexture source[3] +#define BLOOM_APPROXtexture_size sourceSize[3].xy +#define BLOOM_APPROXvideo_size sourceSize[3].xy + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +float bloom_approx_scale_x = targetSize.y / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +//#include "bloom-functions.h" + +//////////////////////////// BEGIN BLOOM-FUNCTIONS /////////////////////////// + +#ifndef BLOOM_FUNCTIONS_H +#define BLOOM_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. 
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These utility functions and constants help several passes determine the +// size and center texel weight of the phosphor bloom in a uniform manner. + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// We need to calculate the correct blur sigma using some .cgp constants: +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. 
Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (try 1.0 if that looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (try 1.0 if that looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) 
filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// IN.output_size < IN.video_size. +// 4.) IN.output_size == IN.video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (IN.video_size/IN.output_size)/IN.texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(IN.video_size/IN.output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize +// NOTE(review): tex2Dblur3x3resize already has a row in the table above; +// confirm whether it still belongs in this untested list. + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter.
+ static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting.
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.)
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.)
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING
FUNCTIONS ///////////////////// + +// encode_output(): if gamma_encode_output is set, raise color.rgb to +// 1.0/get_pass_output_gamma(); otherwise return color untouched. On the +// encoding path, alpha is replaced by 1.0 when assume_opaque_alpha is set. +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +// decode_input(): counterpart of encode_output() for texture reads; if +// linearize_input is set, raise color.rgb to get_pass_input_gamma(), +// otherwise return color untouched (alpha included). +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +// decode_gamma_input(): always raise color.rgb to the caller-supplied +// per-channel gamma, independent of linearize_input. +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +// Each overload samples tex and passes the result through decode_input(); +// the float3 forms use only tex_coords.xy. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +// NOTE(review): the two texel_off overloads below hand texel_off to +// textureLod() as its LOD argument rather than applying a texel offset -- +// confirm this substitution is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +// NOTE(review): both overloads read only tex_coords.xy. The first samples +// at a fixed LOD of 0.0 and the second passes texel_off as the LOD, so +// tex_coords.zw never influences the lookup -- confirm this is intended. +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +// NOTE(review): the block comment opened just before the tex3D wrappers is +// not closed by the slash-star banner that follows them, so the two +// tex2D_linearize_gamma overloads below appear to remain commented out +// until the next comment terminator (just before the tex2Dlod gamma +// wrappers) -- confirm this is intended. +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +// NOTE(review): both overloads read only tex_coords.xy. The first samples +// at a fixed LOD of 0.0 and the second passes texel_off as the LOD, so +// tex_coords.zw never influences the lookup -- confirm this is intended. +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "quad-pixel-communication.h" + +/////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION ////////////////////// + +#ifndef QUAD_PIXEL_COMMUNICATION_H +#define QUAD_PIXEL_COMMUNICATION_H +
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// Given screen pixel numbers, derive a "quad vector" describing a fragment's +// position in its 2x2 pixel quad. Given that vector, obtain the values of any +// variable at neighboring fragments. +// Requires: Using this file in general requires: +// 1.) ddx() and ddy() are present in the current Cg profile. +// 2.) The GPU driver is using fine/high-quality derivatives. +// Functions will give incorrect results if this is not true, +// so a test function is included. + + +///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES //////////////////// + +float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Two measures of the current fragment's output pixel number + // in the range ([0, IN.output_size.x), [0, IN.output_size.y)): + // 1.) output_pixel_num_wrt_uvxy.xy increase with uv coords. + // 2.) output_pixel_num_wrt_uvxy.zw increase with screen xy. + // Returns: Two measures of the fragment's position in its 2x2 quad: + // 1.) The .xy components are its 2x2 placement with respect to + // uv direction (the origin (0, 0) is at the top-left): + // top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0) + // bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0) + // You need this to arrange/weight shared texture samples. + // 2.) The .zw components are its 2x2 placement with respect to + // screen xy direction (IN.position); the origin varies. + // quad_gather needs this measure to work correctly. + // Note: quad_vector.zw = quad_vector.xy * float2( + // ddx(output_pixel_num_wrt_uvxy.x), + // ddy(output_pixel_num_wrt_uvxy.y)); + // Caveats: This function assumes the GPU driver always starts 2x2 pixel + // quads at even pixel numbers. This assumption can be wrong + // for odd output resolutions (nondeterministically so). 
+ float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; + float4 quad_vector = pixel_odd * 2.0 - float4(1.0); + return quad_vector; +} + +float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Same as get_quad_vector_naive() (see that first), plus ddx()/ddy(). + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + float4 quad_vector_guess = + get_quad_vector_naive(output_pixel_num_wrt_uvxy); + // If quad_vector_guess.zw doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z), + ddy(quad_vector_guess.w)); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +float4 get_quad_vector(float2 output_pixel_num_wrt_uv) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) output_pixel_num_wrt_uv must increase with uv coords and + // measure the current fragment's output pixel number in: + // ([0, IN.output_size.x), [0, IN.output_size.y)) + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + // Caveats: This function requires less information than the version + // taking a float4, but it's potentially slower. + // Do screen coords increase with or against uv? Get the direction + // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}. 
+ float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x), + ddy(output_pixel_num_wrt_uv.y)); + float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; + float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0; + float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; + // If quad_vector_screen_guess doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x), + ddy(quad_vector_screen_guess.y)); + float4 quad_vector_guess = float4( + quad_vector_uv_guess, quad_vector_screen_guess); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +void quad_gather(float4 quad_vector, float4 curr, + out float4 adjx, out float4 adjy, out float4 diag) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) The GPU driver is using fine/high-quality derivatives. + // 3.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 4.) curr is any vector you wish to get neighboring values of. + // Returns: Values of an input vector (curr) at the neighboring fragments: + // adjacent in x (adjx), adjacent in y (adjy), and diagonal (diag), via out parameters. 
+ adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float3 curr, + out float3 adjx, out float3 adjy, out float3 diag) +{ + // Float3 version of quad_gather() (same method as the float4 overload): + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float2 curr, + out float2 adjx, out float2 adjy, out float2 diag) +{ + // Float2 version of quad_gather() (same method as the float4 overload): + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +float4 quad_gather(float4 quad_vector, float curr) +{ + // Float version (packs all four values into the return value instead of out parameters): + // Returns: return.x == current + // return.y == adjacent x + // return.z == adjacent y + // return.w == diagonal + float4 all = float4(curr); + all.y = all.x - ddx(all.x) * quad_vector.z; + all.zw = all.xy - ddy(all.xy) * quad_vector.w; + return all; +} + +float4 quad_gather_sum(float4 quad_vector, float4 curr) +{ + // Requires: Same as quad_gather() + // Returns: Sum of an input vector (curr) at all four fragments in a quad. + float4 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float3 quad_gather_sum(float4 quad_vector, float3 curr) +{ + // Float3 version of quad_gather_sum(): + float3 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float2 quad_gather_sum(float4 quad_vector, float2 curr) +{ + // Float2 version of quad_gather_sum(): + float2 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float quad_gather_sum(float4 quad_vector, float curr) +{ + // Float version of quad_gather_sum() (sums the components of the float quad_gather() result): + float4 all_values = quad_gather(quad_vector, curr); + return (all_values.x + all_values.y + all_values.z + all_values.w); +} + +bool fine_derivatives_working(float4 quad_vector, float4 curr) +{ + // Requires: 1.) 
ddx() and ddy() are present in the current Cg profile. + // 2.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 3.) curr must be a test vector with non-constant derivatives + // (its value should change nonlinearly across fragments). + // Returns: true if fine/hybrid/high-quality derivatives are used, or + // false if coarse derivatives are used or the test is inconclusive. + // Usage: Test whether quad-pixel communication is working! + // Method: We can confirm fine derivatives are used if the following + // holds (ever, for any value at any fragment): + // (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy)) + // The more values we test (e.g. test a float4 two ways), the + // easier it is to demonstrate fine derivatives are working. + // TODO: Check for floating point exact comparison issues! + float4 ddx_curr = ddx(curr); + float4 ddy_curr = ddy(curr); + float4 adjx = curr - ddx_curr * quad_vector.z; + float4 adjy = curr - ddy_curr * quad_vector.w; + bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w)); + bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w)); + return any(bool2(ddy_different, ddx_different)); +} + +bool fine_derivatives_working_fast(float4 quad_vector, float curr) +{ + // Requires: Same as fine_derivatives_working(), except curr is a single float. + // Returns: Same as fine_derivatives_working() + // Usage: This is faster than fine_derivatives_working() but more + // likely to return false negatives, so it's less useful for + // offline testing/debugging. It's also useless as the basis + // for dynamic runtime branching as of May 2014: Derivatives + // (and quad-pixel communication) are currently disallowed in + // branches. 
However, future GPU's may allow you to use them + // in dynamic branches if you promise the branch condition + // evaluates the same for every fragment in the quad (and/or if + // the driver enforces that promise by making a single fragment + // control branch decisions). If that ever happens, this + // version may become a more economical choice. + float ddx_curr = ddx(curr); + float ddy_curr = ddy(curr); + float adjx = curr - ddx_curr * quad_vector.z; + return (ddy_curr != ddy(adjx)); +} + +#endif // QUAD_PIXEL_COMMUNICATION_H + +//////////////////////// END QUAD-PIXEL-COMMUNICATION /////////////////////// + +//#include "special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version of erf6() (same approximation, applied per component): + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version of erf6() (same approximation, applied per component): + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version of erf6() (same approximation on a scalar): + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version of erft(): + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version of erft(): + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version of erft(): + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: erft(x) if ERF_FAST_APPROXIMATION is defined, otherwise erf6(x). + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version of erf(): + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version of erf(): + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version of erf(): + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally spaced + // evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; multiply by s_inv (1/s) for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version of gamma_impl() (same constants and formula): + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version of gamma_impl() (same constants and formula): + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version of gamma_impl() (same constants and formula): + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the (0, 36] range (1.0/s is computed here). + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version of gamma(): + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version of gamma(): + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version of gamma(): + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial) + // for(int i = 1; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + i) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version of normalized_ligamma(): + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version of normalized_ligamma(): + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version of normalized_ligamma(): + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords (.zw are set to 0.0): + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(0.5/(sigma*sqrt(2.0)))): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blur sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using an 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-5, ignoring constant factors (since we're normalizing). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Sum weighted samples, normalize by weight_sum_inv, and return. Blurs are + // currently optimized for dynamic weights. 
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as in tex2Dblur11resize(). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Sum weighted samples, normalize by weight_sum_inv, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Sum weighted samples, normalize by weight_sum_inv, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Sum weighted samples, normalize by weight_sum_inv, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Sum weighted samples, normalize by weight_sum_inv, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + // Each tap sits between two neighboring texels A and B at a fraction + // wB/(wA + wB) past A, so that linear filtering (see Requires) mixes + // them in proportion to their Gaussian weights; the tap itself is then + // weighted by wA + wB. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Sum the six weighted linear taps and normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel is read on its own with weight w0; on each side, + // texels 1-2 and texels 3-4 are each fetched with a single linear tap. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Sum the five weighted taps and normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight.
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Sum the four weighted linear taps and normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel is read on its own with weight w0; texels 1-2 on each + // side are fetched with a single linear tap. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Sum the three weighted taps and normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + // (Both taps carry weight w01 and the full weight sum is 2.0 * w01, so + // normalizing reduces to the 0.5 factor below. NOTE(review): + // weight_sum_inv above is therefore computed but never used.) + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + // The exact sum above is disabled in favor of a helper call. + // NOTE(review): get_fast_gaussian_weight_sum_inv() is defined outside + // this chunk; presumably it approximates 1/(sum of all tap weights) from + // sigma alone -- confirm it matches this kernel width closely enough. + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight.
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Sum the 22 weighted linear taps (11 on each side of tex_uv) and + // normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; +
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + // The exact sum above is disabled in favor of a helper call. + // NOTE(review): get_fast_gaussian_weight_sum_inv() is defined outside + // this chunk; presumably it approximates 1/(sum of all tap weights) from + // sigma alone -- confirm it matches this kernel width closely enough. + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight.
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Sum the 16 weighted linear taps (8 on each side of tex_uv) and + // normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + // The exact sum above is disabled in favor of a helper call. + // NOTE(review): get_fast_gaussian_weight_sum_inv() is defined outside + // this chunk; presumably it approximates 1/(sum of all tap weights) from + // sigma alone -- confirm it matches this kernel width closely enough. + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs.
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Sum the center texel plus 12 weighted linear taps (6 per side) and + // normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + // The exact sum above is disabled in favor of a helper call. + // NOTE(review): get_fast_gaussian_weight_sum_inv() is defined outside + // this chunk; presumably it approximates 1/(sum of all tap weights) from + // sigma alone -- confirm it matches this kernel width closely enough. + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Sum the center texel plus 8 weighted linear taps (4 per side) and + // normalize once on return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Params: tex_uv is the center sample; dxdy.x and dxdy.y are the uv + // steps to the horizontal and vertical neighbors; sigma is + // the Gaussian standard deviation in those steps (nonzero). + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + // Samples are numbered 0-8 row by row: 0-2 at -dy, 3-5 on the center + // row, 6-8 at +dy; sample4 is the center. + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: one weight for the four + // edge-adjacent samples and one for the four corners. + // NOTE(review): LENGTH_SQ is defined outside this chunk; presumably it + // yields the squared vector length -- confirm. + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, +
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9fast() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks.
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned + // (each offset is the nearer texel's index plus the ratio toward the + // farther texel; the other quadrants and axes are derived by mirroring + // or swizzling these when the samples are loaded): + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and."
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights + // (samples 4 and 5 cover the same texel distances with x and y swapped, + // so they share one weight): + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor); every non-center + // sample weight occurs four times, once per direction or quadrant: + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy *
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex,
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Sum the weighted samples group by group, then scale by weight_sum_inv + // on return so that the effective weights total 1.0. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9fast() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7fast(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned + // (the center row and column are shared between mirrored samples, which + // is why texel0to1ratio is computed with only half of the center weight): + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights.
+ // Split weights for shared texels between samples sharing them: the + // center texel is shared by all four sample1's (hence 0.25), and texels + // on the center row or column are shared by two samples (hence 0.5, or + // two half-weight texels adding up to one full weight in w1). + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor). Each quadrant holds + // one sample1, one sample2, one sample3 (same weight as sample2), and + // one sample4: + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy *
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Sum the weighted samples group by group, then scale by weight_sum_inv + // on return so that the effective weights total 1.0. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = 
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) 
tex_uv.w = log2(IN.video_size/IN.output_size).y + // 6.) tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12 + // bilinear samples, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps.
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy =
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) +
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be some inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample1*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +/////////////////////////////// BLOOM CONSTANTS ////////////////////////////// + +// Compute constants with manual inlines of the functions below: +static const float bloom_diff_thresh = 1.0/256.0; + + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +inline float get_absolute_scale_blur_sigma(const float thresh) +{ + // Requires: 1.) min_expected_triads must be a global float. The number + // of horizontal phosphor triads in the final image must be + // >= min_allowed_viewport_triads.x for realistic results. + // 2.) bloom_approx_scale_x must be a global float equal to the + // absolute horizontal scale of BLOOM_APPROX. + // 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x + // should be <= 1.1658025090 to keep the final result < + // 0.62666015625 (the largest sigma ensuring the largest + // unused texel weight stays < 1.0/256.0 for a 3x3 blur). + // 4.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). 
+ // Returns: Return the minimum Gaussian sigma that will blur the pass + // output as much as it would have taken to blur away + // bloom_approx_scale_x horizontal phosphor triads. + // Description: + // BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd + // use the same blur sigma as the actual phosphor bloom and scale it down + // to the current resolution with (bloom_approx_scale_x/viewport_size_x), but + // we don't know the viewport size in this pass. Instead, we'll blur as + // much as it would take to blur away min_allowed_viewport_triads.x. This + // will blur "more than necessary" if the user actually uses more triads, + // but that's not terrible either, because blurring a constant fraction of + // the viewport may better resemble a true optical bloom anyway (since the + // viewport will generally be about the same fraction of each player's + // field of view, regardless of screen size and resolution). + // Assume an extremely large viewport size for asymptotic results. + return bloom_approx_scale_x/max_viewport_size_x * + get_min_sigma_to_blur_triad( + max_viewport_size_x/min_allowed_viewport_triads.x, thresh); +} + +inline float get_center_weight(const float sigma) +{ + // Given a Gaussian blur sigma, get the blur weight for the center texel. 
+ #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return get_fast_gaussian_weight_sum_inv(sigma); + #else + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + // Note: If the implementation uses a smaller blur than the max allowed, + // the worst case scenario is that the center weight will be overestimated, + // so we'll put a bit more energy into the brightpass...no huge deal. 
+ // Then again, if the implementation uses a larger blur than the max + // "allowed" because of dynamic branching, the center weight could be + // underestimated, which is more of a problem...consider always using + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // 43x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + + w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + // 31x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + + w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + // 25x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + // 17x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + #else + // 9x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + const float center_weight = weight_sum_inv * weight_sum_inv; + return center_weight; + #endif +} + +inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // If sigma is static, we can safely branch and use the smallest blur + // that's big enough. Ignore #define hints, because we'll only use a + // large blur if we actually need it, and the branches cost nothing. 
+ #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #else + // It's still worth branching if the profile supports dynamic branches: + // It's much faster than using a hugely excessive blur, but each branch + // eats ~1% FPS. + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #endif + #endif + // Failed optimization notes: + // I originally created a same-size mipmapped 5-tap separable blur10 that + // could handle any sigma by reaching into lower mip levels. It was + // as fast as blur25fast for runtime sigmas and a tad faster than + // blur31fast for static sigmas, but mipmapping two viewport-size passes + // ate 10% of FPS across all codepaths, so it wasn't worth it. + #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + if(sigma <= blur9_std_dev) + { + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur17_std_dev) + { + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur25_std_dev) + { + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur31_std_dev) + { + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + } + else + { + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + } + #else + // If we can't afford to branch, we can only guess at what blur + // size we need. Therefore, use the largest blur allowed. 
+ #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + #else + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + #endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE +} + +inline float get_bloom_approx_sigma(const float output_size_x_runtime, + const float estimated_viewport_size_x) +{ + // Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x. + // This is included for dynamic codepaths just in case the + // following two globals are incorrect: + // 2.) bloom_approx_size_x_for_fake should == the same + // if PHOSPHOR_BLOOM_FAKE is #defined + // 3.) bloom_approx_size_x should == the same otherwise + // Returns: For gaussian4x4, return a dynamic small bloom sigma that's + // as close to optimal as possible given available information. + // For blur3x3, return a static small bloom sigma that + // works well for typical cases. Otherwise, we're using simple + // bilinear filtering, so use static calculations. + // Assume the default static value. This is a compromise that ensures + // typical triads are blurred, even if unusually large ones aren't.
+ static const float mask_num_triads_static = + max(min_allowed_viewport_triads.x, mask_num_triads_desired_static); + const float mask_num_triads_from_size = + estimated_viewport_size_x/mask_triad_size_desired; + const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x, + lerp(mask_num_triads_from_size, mask_num_triads_desired, + mask_specify_num_triads)); + // Assume an extremely large viewport size for asymptotic results: + static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // Use the runtime num triads and output size: + const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_runtime; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_runtime/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // account for the Gaussian scanline sigma from the last pass too. + // The bloom will be too wide horizontally but tall enough vertically. + return length(float2(bloom_approx_sigma, beam_max_sigma)); + } + else // 3x3 blur resize (the bilinear resize doesn't need a sigma) + { + // We're either using blur3x3 or bilinear filtering. The biggest + // reason to choose blur3x3 is to avoid dynamic weights, so use a + // static calculation. 
+ #ifdef PHOSPHOR_BLOOM_FAKE + static const float output_size_x_static = + bloom_approx_size_x_for_fake; + #else + static const float output_size_x_static = bloom_approx_size_x; + #endif + static const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_static; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_static/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // try accounting for the Gaussian scanline sigma from the last pass + // too; use the static default value: + return length(float2(bloom_approx_sigma, beam_max_sigma_static)); + } +} + +inline float get_final_bloom_sigma(const float bloom_sigma_runtime) +{ + // Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's + // optimal for the [known] triad size. + // 2.) Call this from a fragment shader (not a vertex shader), + // or blurring with static sigmas won't be constant-folded. + // Returns: Return the optimistic static sigma if the triad size is + // known at compile time. Otherwise return the optimal runtime + // sigma (10% slower) or an implementation-specific compromise + // between an optimistic or pessimistic static sigma. + // Notes: Call this from the fragment shader, NOT the vertex shader, + // so static sigmas can be constant-folded! 
+ const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad( + mask_triad_size_desired_static, bloom_diff_thresh); + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return bloom_sigma_runtime; + #else + // Overblurring looks as bad as underblurring, so assume average-size + // triads, not worst-case huge triads: + return bloom_sigma_optimistic; + #endif +} + + +#endif // BLOOM_FUNCTIONS_H + +//////////////////////////// END BLOOM-FUNCTIONS /////////////////////////// + +/////////////////////////// END FRAGMENT-INCLUDES ////////////////////////// + +void main() { + // Blur the brightpass along bloom_dxdy (the vertical pass) with a 9/17/25/31/43x blur: + const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime); + const float3 color = tex2DblurNfast(input_texture, tex_uv, + bloom_dxdy, bloom_sigma); + // Encode and output the blurred image: + FragColor = encode_output(float4(color, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/bloom-vertical.vs b/shaders/CRT-Royale.shader/bloom-vertical.vs new file mode 100644 index 00000000..dfec96e6 --- /dev/null +++ b/shaders/CRT-Royale.shader/bloom-vertical.vs @@ -0,0 +1,3792 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details.
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 tex_uv; + vec2 bloom_dxdy; + float bloom_sigma_runtime; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define 
interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define MASKED_SCANLINEStexture source[0] +#define MASKED_SCANLINEStexture_size sourceSize[0].xy +#define MASKED_SCANLINESvideo_size sourceSize[0].xy +#define BLOOM_APPROXtexture source[3] +#define BLOOM_APPROXtexture_size sourceSize[3].xy +#define BLOOM_APPROXvideo_size sourceSize[3].xy + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; set to 0.5 here (an older note on this line claimed 1.0, which does not match the value) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an older note said 1.0 was used to avoid a dark image, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-params.h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note called 0.5 unnecessarily dark and suggested 1.0, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note called 0.5 unnecessarily dark and suggested 1.0, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; // NOTE(review): .x is -1.0/3.0 with the default static offset, so this shrinks the border below; abs() may be intended - confirm + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : // mask type 0: aperture grille + mask_type < 1.5 ? mask_slot_amplify : // mask type 1: slot mask + mask_shadow_amplify; // mask type 2: shadow mask +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); // mode 0 (manual sinc resize) needs PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); // mode 0 (manual sinc resize) needs PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the 
Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. 
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader.
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details.
This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. 
LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif //
SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + 
+///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" + +///////////////////////////// CODEPATH SELECTION ///////////////////////////// + +// Choose a looping strategy based on what's allowed: +// Dynamic loops not allowed: Use a flat static loop. +// Dynamic loops accomodated: Coarsely branch around static loops. +// Dynamic loops assumed allowed: Use a flat dynamic loop. +#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. 
+// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. Round up to be safe: +static const float max_sinc_resize_samples_m4 = ceil( + max_sinc_resize_samples_float * 0.25) * 4.0; + + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) +{ + // Requires: The following global constants must be defined: + // 1.) mask_sinc_lobes + // 2.) max_sinc_resize_samples_m4 + // Returns: The minimum number of texture samples for a correct downsize + // at magnification_scale. + // We're downsizing, so the filter is sized across 2*lobes output pixels + // (not 2*lobes input texels). This impacts distance measurements and the + // minimum number of input samples needed. + const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale; + const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0; + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + const float max_samples_m4 = max_sinc_resize_samples_m4; + #else // ifdef BREAK_LOOPS_INTO_PIECES + // Simulating loops with branches imposes a 128-sample limit. + const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4); + #endif + return min(min_samples_m4, max_samples_m4); +} + +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, + const float input_tiles_per_texture_r, const float samples, + static const bool vertical) +{ + // Requires: 1.)
dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) input_tiles_per_texture_r is the number of input tiles + // that can fit in the input texture in the direction we're + // resampling this pass. + // 3.) vertical indicates whether we're resampling vertically + // this pass (or horizontally). + // Returns: Pack and return the first sample's tile_uv coord in [0, 1] + // and its texel distance from the destination pixel, in the + // resized dimension only. + // We'll start with the topmost or leftmost sample and work down or right, + // so get the first sample location and distance. Modify both dimensions + // as if we're doing a one-pass 2D resize; we'll throw away the unneeded + // (and incorrect) dimension at the end. + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. + const float2 first_texel_tile_uv_wrap_2D = + first_texel_uv_wrap_2D * input_tiles_per_texture_r; + // Project wrapped coordinates to the [0, 1] range. We'll do this with all + // samples, but the first texel is special, since it might be negative. + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; + // Pack the first texel's tile_uv coord and texel distance in 1D: + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + return vertical ?
tile_v_and_dist : tile_u_and_dist; + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); +} + +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) +{ + // Mipmapping and anisotropic filtering get confused by sinc-resampling. + // One [slow] workaround is to select the lowest mip level: + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + return textureLod(tex, tex_uv, 0.0); // explicit LOD 0; textureLod requires (sampler, coord, lod) + #else + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); + #else + return texture(tex, tex_uv); + #endif + #endif +} + + +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples.
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. 
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == IN.output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonically opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture. 
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! 
+ return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +///////////////////////////// END VERTEX-INCLUDES //////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +float bloom_approx_scale_x = targetSize.y / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + +// copied from bloom-functions.h +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +void main() { + gl_Position = position; + vTexCoord = texCoord; + tex_uv = vTexCoord.xy * 1.0001; + + // Get the uv sample distance between output pixels. Calculate dxdy like + // blurs/vertex-shader-blur-fast-vertical.h. 
+ const float2 dxdy_scale = video_size/output_size; + const float2 dxdy = dxdy_scale/texture_size; + // This blur is vertical-only, so zero out the horizontal offset: + bloom_dxdy = float2(0.0, dxdy.y); + + // Calculate a runtime bloom_sigma in case it's needed: + const float mask_tile_size_x = get_resized_mask_tile_size( + output_size, output_size * mask_resize_viewport_scale, false).x; + bloom_sigma_runtime = get_min_sigma_to_blur_triad( + mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/blur9fast-horizontal.fs b/shaders/CRT-Royale.shader/blur9fast-horizontal.fs new file mode 100644 index 00000000..c7293eed --- /dev/null +++ b/shaders/CRT-Royale.shader/blur9fast-horizontal.fs @@ -0,0 +1,2016 @@ +#version 150 + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +#if __VERSION__ >= 130 +#define COMPAT_TEXTURE texture +#else +#define COMPAT_TEXTURE texture2D +#endif + +#ifdef GL_ES +#define COMPAT_PRECISION mediump +#else +#define COMPAT_PRECISION +#endif + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +in Vertex { + vec2 vTexCoord; + vec2 blur_dxdy; +}; + +out vec4 FragColor; + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +//#define GAMMA_ENCODE_EVERY_FBO +//#define FIRST_PASS +//#define LAST_PASS +//#define SIMULATE_CRT_ON_LCD +//#define SIMULATE_GBA_ON_LCD +//#define SIMULATE_LCD_ON_CRT +//#define SIMULATE_GBA_ON_CRT + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? 
+ float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + float lcd_reference_gamma = 2.5; // To match CRT + float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + float get_crt_gamma() { return crt_gamma; } + float get_gba_gamma() { return gba_gamma; } + float get_lcd_gamma() { return lcd_gamma; } +#else + float get_crt_gamma() { return crt_reference_gamma_high; } + float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + float get_intermediate_gamma() { return intermediate_gamma; } + float get_input_gamma() { return input_gamma; } + float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + float get_input_gamma() { return get_crt_gamma(); } + float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + float get_input_gamma() { return get_gba_gamma(); } + float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + float get_input_gamma() { return get_lcd_gamma(); } + float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + float get_input_gamma() { return get_gba_gamma(); } + float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + float get_input_gamma() { return ntsc_gamma; } + float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +#ifndef 
GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + bool linearize_input = true; + float get_pass_input_gamma() { return get_input_gamma(); } + #else + bool linearize_input = false; + float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + bool gamma_encode_output = true; + float get_pass_output_gamma() { return get_output_gamma(); } + #else + bool gamma_encode_output = false; + float get_pass_output_gamma() { return 1.0; } + #endif +#else + bool linearize_input = true; + bool gamma_encode_output = true; + #ifdef FIRST_PASS + float get_pass_input_gamma() { return get_input_gamma(); } + #else + float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + float get_pass_output_gamma() { return get_output_gamma(); } + #else + float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Linearize a sampled color. When linearize_input is set, rgb is raised to +// get_pass_input_gamma() and alpha is forced to 1.0 if assume_opaque_alpha is +// set; otherwise the color is returned unchanged. +vec4 decode_input(vec4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), 1.0); + } + else + { + return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +// Gamma-encode an output color. When gamma_encode_output is set, rgb is raised +// to 1.0/get_pass_output_gamma() and alpha is forced to 1.0 if +// assume_opaque_alpha is set; otherwise the color is returned unchanged. +vec4 encode_output(vec4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords) +//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords))); } + +//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E))) +//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off) +//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off))); } + +#endif // GAMMA_MANAGEMENT_H + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + 
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// IN.output_size < IN.video_size. +// 4.) IN.output_size == IN.video_size / pow(2, M), where M is some +// positive integer. 
tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (IN.video_size/IN.output_size)/IN.texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(IN.video_size/IN.output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static float blur3_std_dev +// static float blur4_std_dev +// static float blur5_std_dev +// static float blur6_std_dev +// static float blur7_std_dev +// static float blur8_std_dev +// static float blur9_std_dev +// static float blur10_std_dev +// static float blur11_std_dev +// static float blur12_std_dev +// static float blur17_std_dev +// static float blur25_std_dev +// static float blur31_std_dev +// static float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ float blur3_std_dev = 0.84931640625; + float blur4_std_dev = 0.84931640625; + float blur5_std_dev = 1.0595703125; + float blur6_std_dev = 1.06591796875; + float blur7_std_dev = 1.17041015625; + float blur8_std_dev = 1.1720703125; + float blur9_std_dev = 1.2259765625; + float blur10_std_dev = 1.21982421875; + float blur11_std_dev = 1.25361328125; + float blur12_std_dev = 1.2423828125; + float blur17_std_dev = 1.27783203125; + float blur25_std_dev = 1.2810546875; + float blur31_std_dev = 1.28125; + float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + float blur3_std_dev = 0.62666015625; + float blur4_std_dev = 0.66171875; + float blur5_std_dev = 0.9845703125; + float blur6_std_dev = 1.02626953125; + float blur7_std_dev = 1.36103515625; + float blur8_std_dev = 1.4080078125; + float blur9_std_dev = 1.7533203125; + float blur10_std_dev = 1.80478515625; + float blur11_std_dev = 2.15986328125; + float blur12_std_dev = 2.215234375; + float blur17_std_dev = 3.45535583496; + float blur25_std_dev = 5.3409576416; + float blur31_std_dev = 6.86488037109; + float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. + float error_blurring = 0.5; +#endif + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. 
+//#include "gamma-management.h" +//#include "quad-pixel-communication.h" +//#include "special-functions.h" + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). 
+// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (vec4/vec3/vec2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +vec4 erf6(vec4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + vec4 one = vec4(1.0); + vec4 sign_x = sign(x); + vec4 t = one/(one + 0.47047*abs(x)); + vec4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec3 erf6(vec3 x) +{ + // vec3 version: + vec3 one = vec3(1.0); + vec3 sign_x = sign(x); + vec3 t = one/(one + 0.47047*abs(x)); + vec3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec2 erf6(vec2 x) +{ + // vec2 version: + vec2 one = vec2(1.0); + vec2 sign_x = sign(x); + vec2 t = one/(one + 0.47047*abs(x)); + vec2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(float x) +{ + // Float version: + float sign_x = sign(x); + float t = 1.0/(1.0 + 0.47047*abs(x)); + float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec4 erft(vec4 x) +{ + 
// Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. + return tanh(1.202760580 * x); +} + +vec3 erft(vec3 x) +{ + // vec3 version: + return tanh(1.202760580 * x); +} + +vec2 erft(vec2 x) +{ + // vec2 version: + return tanh(1.202760580 * x); +} + +float erft(float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +vec4 erf(vec4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +vec3 erf(vec3 x) +{ + // vec3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +vec2 erf(vec2 x) +{ + // vec2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +float erf(float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +vec4 gamma_impl(vec4 s, vec4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. 
+ // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. + vec4 g = vec4(1.12906830989); + vec4 c0 = vec4(0.8109119309638332633713423362694399653724431); + vec4 c1 = vec4(0.4808354605142681877121661197951496120000040); + vec4 e = vec4(2.71828182845904523536028747135266249775724709); + vec4 sph = s + vec4(0.5); + vec4 lanczos_sum = c0 + c1/(s + vec4(1.0)); + vec4 base = (sph + g)/e; // or (s + g + vec4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. 
+ return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec3 gamma_impl(vec3 s, vec3 s_inv) +{ + // vec3 version: + vec3 g = vec3(1.12906830989); + vec3 c0 = vec3(0.8109119309638332633713423362694399653724431); + vec3 c1 = vec3(0.4808354605142681877121661197951496120000040); + vec3 e = vec3(2.71828182845904523536028747135266249775724709); + vec3 sph = s + vec3(0.5); + vec3 lanczos_sum = c0 + c1/(s + vec3(1.0)); + vec3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec2 gamma_impl(vec2 s, vec2 s_inv) +{ + // vec2 version: + vec2 g = vec2(1.12906830989); + vec2 c0 = vec2(0.8109119309638332633713423362694399653724431); + vec2 c1 = vec2(0.4808354605142681877121661197951496120000040); + vec2 e = vec2(2.71828182845904523536028747135266249775724709); + vec2 sph = s + vec2(0.5); + vec2 lanczos_sum = c0 + c1/(s + vec2(1.0)); + vec2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(float s, float s_inv) +{ + // Float version: + float g = 1.12906830989; + float c0 = 0.8109119309638332633713423362694399653724431; + float c1 = 0.4808354605142681877121661197951496120000040; + float e = 2.71828182845904523536028747135266249775724709; + float sph = s + 0.5; + float lanczos_sum = c0 + c1/(s + 1.0); + float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec4 gamma(vec4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. 
+ return gamma_impl(s, vec4(1.0)/s); +} + +vec3 gamma(vec3 s) +{ + // vec3 version: + return gamma_impl(s, vec3(1.0)/s); +} + +vec2 gamma(vec2 s) +{ + // vec2 version: + return gamma_impl(s, vec2(1.0)/s); +} + +float gamma(float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms, k = 0..3). + // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0.0) * last_factorial) + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + vec4 scale = pow(z, s); + vec4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3 (denomK = (s + K) * K!): + vec4 z_sq = z*z; + vec4 denom1 = s + vec4(1.0); + vec4 denom2 = 2.0*s + vec4(4.0); + vec4 denom3 = 6.0*s + vec4(18.0); + //vec4 denom4 = 24.0*s + vec4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv) +{ + // vec3 version: + vec3 scale = pow(z, s); + vec3 sum = s_inv; + vec3 z_sq = z*z; + vec3 denom1 = s + vec3(1.0); + vec3 denom2 = 2.0*s + vec3(4.0); + vec3 denom3 = 6.0*s + vec3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv) +{ + // vec2 version: + vec2 scale = 
pow(z, s); + vec2 sum = s_inv; + vec2 z_sq = z*z; + vec2 denom1 = s + vec2(1.0); + vec2 denom2 = 2.0*s + vec2(4.0); + vec2 denom3 = 6.0*s + vec2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(float s, float z, float s_inv) +{ + // Float version: + float scale = pow(z, s); + float sum = s_inv; + float z_sq = z*z; + float denom1 = s + 1.0; + float denom2 = 2.0*s + 4.0; + float denom3 = 6.0*s + 18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +vec4 uigamma_large_z_impl(vec4 s, vec4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = vec4('inf'); + // vec4 one = vec4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + vec4 numerator = pow(z, s) * exp(-z); + vec4 denom = vec4(7.0) + z - s; + denom = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/denom; + denom = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/denom; + denom = vec4(1.0) + z - s + (s - vec4(1.0))/denom; + return numerator / denom; +} + +vec3 uigamma_large_z_impl(vec3 s, vec3 z) +{ + // vec3 version: + vec3 numerator = pow(z, s) * exp(-z); + vec3 denom = vec3(7.0) + z - s; + denom = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/denom; + denom = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/denom; + denom = vec3(1.0) + z - s + (s - vec3(1.0))/denom; + return numerator / denom; +} + +vec2 uigamma_large_z_impl(vec2 s, vec2 z) +{ + // vec2 version: + vec2 numerator = pow(z, s) * exp(-z); + vec2 denom = vec2(7.0) + z - s; + denom = vec2(5.0) + z - 
s + (3.0*s - vec2(9.0))/denom; + denom = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/denom; + denom = vec2(1.0) + z - s + (s - vec2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(float s, float z) +{ + // Float version: + float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +vec4 normalized_ligamma_impl(vec4 s, vec4 z, + vec4 s_inv, vec4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + vec4 thresh = vec4(0.775075); + bvec4 z_is_large = greaterThan(z , thresh); + vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0); // 1.0 where z > thresh, else 0.0 + vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results with complementary masks: large_z where z > thresh, small_z elsewhere. + return large_z * z_size_check + small_z * (vec4(1.0) - z_size_check); +} + +vec3 normalized_ligamma_impl(vec3 s, vec3 z, + vec3 s_inv, vec3 gamma_s_inv) +{ + // vec3 version (large_z is selected where z > thresh, small_z elsewhere): + vec3 thresh = vec3(0.775075); + bvec3 z_is_large = greaterThan(z , thresh); + vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 
1.0 : 0.0, z_is_large.z ? 1.0 : 0.0); + vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (vec3(1.0) - z_size_check); +} + +vec2 normalized_ligamma_impl(vec2 s, vec2 z, + vec2 s_inv, vec2 gamma_s_inv) +{ + // vec2 version (large_z is selected where z > thresh, small_z elsewhere): + vec2 thresh = vec2(0.775075); + bvec2 z_is_large = greaterThan(z , thresh); + vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0); + vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (vec2(1.0) - z_size_check); +} + +float normalized_ligamma_impl(float s, float z, + float s_inv, float gamma_s_inv) +{ + // Float version (large_z is selected when z > thresh, small_z otherwise): + float thresh = 0.775075; + float z_size_check = 0.0; + if (z > thresh) z_size_check = 1.0; + float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (1.0 - z_size_check); +} + +// Normalized lower incomplete gamma function for small s: +vec4 normalized_ligamma(vec4 s, vec4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ vec4 s_inv = vec4(1.0)/s; + vec4 gamma_s_inv = vec4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +vec3 normalized_ligamma(vec3 s, vec3 z) +{ + // vec3 version: + vec3 s_inv = vec3(1.0)/s; + vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +vec2 normalized_ligamma(vec2 s, vec2 z) +{ + // vec2 version: + vec2 s_inv = vec2(1.0)/s; + vec2 gamma_s_inv = vec2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(float s, float z) +{ + // Float version: + float s_inv = 1.0/s; + float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +/////////////////////////////////// HELPERS ////////////////////////////////// + +vec4 uv2_to_uv4(vec2 tex_uv) +{ + // Make a vec2 uv offset safe for adding to vec4 tex2Dlod coords: + return vec4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +float get_fast_gaussian_weight_sum_inv(float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blurs sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. 
The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using a 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-4, ignoring constant factors (since we're normalizing). + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ vec3 sum = vec3(0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. 
+ // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w01 = w0 * 0.5 + w1; + float w23 = w2 + w3; + float w45 = w4 + w5; + float w01_ratio = w1/w01; + float w23_ratio = w3/w23; + float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + // Each paired tap is placed (nearer texel index plus ratio) steps from the center, so one + // linear fetch blends the two texels in proportion to their Gaussian weights. The center + // texel is fetched on its own with weight w0. + vec3 sum = vec3(0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + // NOTE(review): the exact 25-texel sum above is disabled in favor of the helper below, + // which is defined elsewhere and takes only sigma; presumably an approximation, so confirm. + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w9_10 = w9 + w10; + float w11_12 = w11 + w12; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + float w9_10_ratio = w10/w9_10; + float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv +
(7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + // NOTE(review): the exact 31-texel sum above is disabled in favor of the helper below, + // which is defined elsewhere and takes only sigma; presumably an approximation, so confirm. + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight.
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + // The two innermost taps sit w0_1_ratio steps either side of the center and each carries + // half of w0, so together they restore the full center weight. Every outer tap sits + // (even texel index plus ratio) steps out and blends one even/odd texel pair. + vec3 sum = vec3(0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x
Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + float w16 = exp(-256.0 * denom_inv); + float w17 = exp(-289.0 * denom_inv); + float w18 = exp(-324.0 * denom_inv); + float w19 = exp(-361.0 * denom_inv); + float w20 = exp(-400.0 * denom_inv); + float w21 = exp(-441.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + // NOTE(review): the exact 43-texel sum above is disabled in favor of the helper below, + // which is defined elsewhere and takes only sigma; presumably an approximation, so confirm. + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight.
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w16_17 = w16 + w17; + float w18_19 = w18 + w19; + float w20_21 = w20 + w21; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + float w16_17_ratio = w17/w16_17; + float w18_19_ratio = w19/w18_19; + float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + // 22 linear taps: 11 combined weights, each fetched once on either side of the center, + // covering texel distances 0 through 21. The taps are listed from far left to far right. + vec3 sum = vec3(0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) *
dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights. No normalization factor is needed here: + // both taps carry the same combined weight (w0 * 0.5 + w1), so the + // normalized result is simply their average. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + float w01 = w0 * 0.5 + w1; + float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + +vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + // w12_ratio is the fraction of the way from texel 1 to texel 2 at which a linear fetch + // returns the two texels mixed in the proportion w1 : w2. + float w12 = w1 + w2; + float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight.
+ float w01 = w0 * 0.5 + w1; + float w23 = w2 + w3; + float w01_ratio = w1/w01; + float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Params: dxdy.x and dxdy.y are the horizontal and vertical uv steps + // between neighboring samples; sigma is the Gaussian standard + // deviation measured in those steps. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+ vec2 sample4_uv = tex_uv; + vec2 dx = vec2(dxdy.x, 0.0); + vec2 dy = vec2(0.0, dxdy.y); + vec2 sample1_uv = sample4_uv - dy; + vec2 sample7_uv = sample4_uv + dy; + vec3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + vec3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + vec3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + vec3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + vec3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + vec3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + // (w4 is the center, w1_3_5_7 the four edge neighbors, w0_2_6_8 the four corners.) + float w4 = 1.0; + float w1_3_5_7 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); + float w0_2_6_8 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + vec3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + +// Resizable one-pass blurs: +// Overload that forwards to the 4-argument version with blur3_std_dev (declared elsewhere) as sigma. +// The sampler parameter is named tex, matching the 4-argument version and leaving the built-in texture() unshadowed. +vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above.
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + float w12 = w1 + w2; + float w34 = w3 + w4; + float w12_ratio = w2/w12; + float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Params: dxdy holds the uv step per texel in x and y (sample offsets + // are expressed in texels and scaled by it); sigma is the + // Gaussian standard deviation in texels. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry).
The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // (Quadrants: a = upper-left, b = upper-right, c = lower-left, d = lower-right.) + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. + + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios for the texel pairs [1, 2] and [3, 4] texels away + // from the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.).
+ float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float w4off = exp(-16.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + // (The first vector is the nearer texel of each pair; the second nudges the sample + // toward the farther texel in proportion to that texel's weight.) + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0); + vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio); + vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio); + vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and."
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2R1 = w3off; + float w2R2 = w4off; + float w3d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w3d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv); + float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv); + float w6d1 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv); + float w6d4 = exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2R1 + w2R2; + float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + // Sample 5 is sample 4 with x and y swapped, so the two share a weight: + float w5 = w4; + float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + // (Every non-center sample occurs four times by symmetry, hence the factor 4.0.) + float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy *
sample2R_texel_offset).rgb; + vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // (Each weight applies to the four mirrored copies of its sample; the center sample is added once.)
+ vec3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ float denom_inv = 0.5/(sigma*sigma); + float w0off = 1.0; + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio); + vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio); + vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + // (For example, w2bd1_3cd1 is the weight of texel 2bd1 and of texel 3cd1 in the layout + // above; both lie two texels from the center along one axis, so they are equal.) + float w1abcd = 1.0; + float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); + float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv); + float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv); + float w1d4 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w2d3_3d2 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w2d4_3d4 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d1 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d2_4d3 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights.
+ // Split weights for shared texels between samples sharing them: + // (The center texel is shared by all four sample-1 taps, hence 0.25. w1bd2_1cd3 is counted + // once because sample 1d receives half of texel 1bd2 and half of texel 1cd3, which have + // equal weight. The axis texels of samples 2 and 3 are each shared by two taps, hence 0.5.) + float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb; + vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy *
sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // (Samples 2 and 3 are transposes of each other, so both groups use the weight w2_3.) + vec3 sum = vec3(0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). + float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and."
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w2d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + vec3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+ // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). + float denom_inv = 0.5/(sigma*sigma); + float w0off = 1.0; + float w1off = exp(-1.0 * denom_inv); + float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + +vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9fast(tex, 
tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur17fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur17fast(texture, tex_uv, dxdy, blur17_std_dev); +} + +vec3 tex2Dblur25fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur25fast(texture, tex_uv, dxdy, blur25_std_dev); +} + +vec3 tex2Dblur43fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur43fast(texture, tex_uv, dxdy, blur43_std_dev); +} +vec3 tex2Dblur31fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur31fast(texture, tex_uv, dxdy, blur31_std_dev); +} + +vec3 tex2Dblur3fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3fast(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur3x3(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3x3(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5fast(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur5resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5resize(texture, tex_uv, dxdy, blur5_std_dev); +} +vec3 tex2Dblur3resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3resize(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5x5(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5x5(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur7resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7resize(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7fast(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7x7(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7x7(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur9resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9resize(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur9x9(sampler2D 
texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9x9(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur11resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur11resize(texture, tex_uv, dxdy, blur11_std_dev); +} + +vec3 tex2Dblur11fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur11fast(texture, tex_uv, dxdy, blur11_std_dev); +} + +#endif // BLUR_FUNCTIONS_H + +#define Source source[0] +#define tex_uv vTexCoord.xy + +#define InputSize sourceSize[0].xy +#define TextureSize sourceSize[0].xy +#define OutputSize targetSize.xy + +void main() { + vec3 color = tex2Dblur9fast(Source, tex_uv, blur_dxdy); + // Encode and output the blurred image: + FragColor = encode_output(vec4(color, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/blur9fast-horizontal.vs b/shaders/CRT-Royale.shader/blur9fast-horizontal.vs new file mode 100644 index 00000000..7f3b2b94 --- /dev/null +++ b/shaders/CRT-Royale.shader/blur9fast-horizontal.vs @@ -0,0 +1,2025 @@ +#version 150 + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +#if __VERSION__ >= 130 +#define COMPAT_TEXTURE texture +#else +#define COMPAT_TEXTURE texture2D +#endif + +#ifdef GL_ES +#define COMPAT_PRECISION mediump +#else +#define COMPAT_PRECISION +#endif + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 blur_dxdy; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +//#define GAMMA_ENCODE_EVERY_FBO +//#define FIRST_PASS +//#define LAST_PASS +//#define SIMULATE_CRT_ON_LCD +//#define SIMULATE_GBA_ON_LCD +//#define SIMULATE_LCD_ON_CRT +//#define SIMULATE_GBA_ON_CRT + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? 
+ float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + float lcd_reference_gamma = 2.5; // To match CRT + float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + float get_crt_gamma() { return crt_gamma; } + float get_gba_gamma() { return gba_gamma; } + float get_lcd_gamma() { return lcd_gamma; } +#else + float get_crt_gamma() { return crt_reference_gamma_high; } + float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + float get_intermediate_gamma() { return intermediate_gamma; } + float get_input_gamma() { return input_gamma; } + float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + float get_input_gamma() { return get_crt_gamma(); } + float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + float get_input_gamma() { return get_gba_gamma(); } + float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + float get_input_gamma() { return get_lcd_gamma(); } + float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + float get_input_gamma() { return get_gba_gamma(); } + float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + float get_input_gamma() { return ntsc_gamma; } + float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +#ifndef
GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + bool linearize_input = true; + float get_pass_input_gamma() { return get_input_gamma(); } + #else + bool linearize_input = false; + float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + bool gamma_encode_output = true; + float get_pass_output_gamma() { return get_output_gamma(); } + #else + bool gamma_encode_output = false; + float get_pass_output_gamma() { return 1.0; } + #endif +#else + bool linearize_input = true; + bool gamma_encode_output = true; + #ifdef FIRST_PASS + float get_pass_input_gamma() { return get_input_gamma(); } + #else + float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + float get_pass_output_gamma() { return get_output_gamma(); } + #else + float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +vec4 decode_input(vec4 color) // Linearize a sampled color with this pass's input gamma. +{ + if(linearize_input) // Test the flag (do not assign it), so the pass-through branch stays reachable. + { + if(assume_opaque_alpha) // Force alpha to 1.0 only when the user opted in. + { + return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), 1.0); + } + else + { + return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +vec4 encode_output(vec4 color) // Gamma-encode an output color with this pass's output gamma. +{ + if(gamma_encode_output) // Test the flag (do not assign it), so the pass-through branch stays reachable. + { + if(assume_opaque_alpha) // Force alpha to 1.0 only when the user opted in. + { + return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords) +//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords))); } + +//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E))) +//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off) +//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off))); } + +#endif // GAMMA_MANAGEMENT_H + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H +
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// IN.output_size < IN.video_size. +// 4.) IN.output_size == IN.video_size / pow(2, M), where M is some +// positive integer. 
tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (IN.video_size/IN.output_size)/IN.texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(IN.video_size/IN.output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static float blur3_std_dev +// static float blur4_std_dev +// static float blur5_std_dev +// static float blur6_std_dev +// static float blur7_std_dev +// static float blur8_std_dev +// static float blur9_std_dev +// static float blur10_std_dev +// static float blur11_std_dev +// static float blur12_std_dev +// static float blur17_std_dev +// static float blur25_std_dev +// static float blur31_std_dev +// static float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ float blur3_std_dev = 0.84931640625; + float blur4_std_dev = 0.84931640625; + float blur5_std_dev = 1.0595703125; + float blur6_std_dev = 1.06591796875; + float blur7_std_dev = 1.17041015625; + float blur8_std_dev = 1.1720703125; + float blur9_std_dev = 1.2259765625; + float blur10_std_dev = 1.21982421875; + float blur11_std_dev = 1.25361328125; + float blur12_std_dev = 1.2423828125; + float blur17_std_dev = 1.27783203125; + float blur25_std_dev = 1.2810546875; + float blur31_std_dev = 1.28125; + float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + float blur3_std_dev = 0.62666015625; + float blur4_std_dev = 0.66171875; + float blur5_std_dev = 0.9845703125; + float blur6_std_dev = 1.02626953125; + float blur7_std_dev = 1.36103515625; + float blur8_std_dev = 1.4080078125; + float blur9_std_dev = 1.7533203125; + float blur10_std_dev = 1.80478515625; + float blur11_std_dev = 2.15986328125; + float blur12_std_dev = 2.215234375; + float blur17_std_dev = 3.45535583496; + float blur25_std_dev = 5.3409576416; + float blur31_std_dev = 6.86488037109; + float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. + float error_blurring = 0.5; +#endif + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. 
+//#include "gamma-management.h" +//#include "quad-pixel-communication.h" +//#include "special-functions.h" + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). 
+// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (vec4/vec3/vec2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +vec4 erf6(vec4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + vec4 one = vec4(1.0); + vec4 sign_x = sign(x); + vec4 t = one/(one + 0.47047*abs(x)); + vec4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec3 erf6(vec3 x) +{ + // vec3 version: + vec3 one = vec3(1.0); + vec3 sign_x = sign(x); + vec3 t = one/(one + 0.47047*abs(x)); + vec3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec2 erf6(vec2 x) +{ + // vec2 version: + vec2 one = vec2(1.0); + vec2 sign_x = sign(x); + vec2 t = one/(one + 0.47047*abs(x)); + vec2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(float x) +{ + // Float version: + float sign_x = sign(x); + float t = 1.0/(1.0 + 0.47047*abs(x)); + float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec4 erft(vec4 x) +{ + 
// Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. + return tanh(1.202760580 * x); +} + +vec3 erft(vec3 x) +{ + // vec3 version: + return tanh(1.202760580 * x); +} + +vec2 erft(vec2 x) +{ + // vec2 version: + return tanh(1.202760580 * x); +} + +float erft(float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +vec4 erf(vec4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +vec3 erf(vec3 x) +{ + // vec3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +vec2 erf(vec2 x) +{ + // vec2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +float erf(float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +vec4 gamma_impl(vec4 s, vec4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. 
+ // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. + vec4 g = vec4(1.12906830989); + vec4 c0 = vec4(0.8109119309638332633713423362694399653724431); + vec4 c1 = vec4(0.4808354605142681877121661197951496120000040); + vec4 e = vec4(2.71828182845904523536028747135266249775724709); + vec4 sph = s + vec4(0.5); + vec4 lanczos_sum = c0 + c1/(s + vec4(1.0)); + vec4 base = (sph + g)/e; // or (s + g + vec4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. 
+ return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec3 gamma_impl(vec3 s, vec3 s_inv) +{ + // vec3 version: + vec3 g = vec3(1.12906830989); + vec3 c0 = vec3(0.8109119309638332633713423362694399653724431); + vec3 c1 = vec3(0.4808354605142681877121661197951496120000040); + vec3 e = vec3(2.71828182845904523536028747135266249775724709); + vec3 sph = s + vec3(0.5); + vec3 lanczos_sum = c0 + c1/(s + vec3(1.0)); + vec3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec2 gamma_impl(vec2 s, vec2 s_inv) +{ + // vec2 version: + vec2 g = vec2(1.12906830989); + vec2 c0 = vec2(0.8109119309638332633713423362694399653724431); + vec2 c1 = vec2(0.4808354605142681877121661197951496120000040); + vec2 e = vec2(2.71828182845904523536028747135266249775724709); + vec2 sph = s + vec2(0.5); + vec2 lanczos_sum = c0 + c1/(s + vec2(1.0)); + vec2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(float s, float s_inv) +{ + // Float version: + float g = 1.12906830989; + float c0 = 0.8109119309638332633713423362694399653724431; + float c1 = 0.4808354605142681877121661197951496120000040; + float e = 2.71828182845904523536028747135266249775724709; + float sph = s + 0.5; + float lanczos_sum = c0 + c1/(s + 1.0); + float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec4 gamma(vec4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. 
+ return gamma_impl(s, vec4(1.0)/s); +} + +vec3 gamma(vec3 s) +{ + // vec3 version: + return gamma_impl(s, vec3(1.0)/s); +} + +vec2 gamma(vec2 s) +{ + // vec2 version: + return gamma_impl(s, vec2(1.0)/s); +} + +float gamma(float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). + // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial); + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + vec4 scale = pow(z, s); + vec4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + vec4 z_sq = z*z; + vec4 denom1 = s + vec4(1.0); + vec4 denom2 = 2.0*s + vec4(4.0); + vec4 denom3 = 6.0*s + vec4(18.0); + //vec4 denom4 = 24.0*s + vec4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv) +{ + // vec3 version: + vec3 scale = pow(z, s); + vec3 sum = s_inv; + vec3 z_sq = z*z; + vec3 denom1 = s + vec3(1.0); + vec3 denom2 = 2.0*s + vec3(4.0); + vec3 denom3 = 6.0*s + vec3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv) +{ + // vec2 version: + vec2 scale =
pow(z, s); + vec2 sum = s_inv; + vec2 z_sq = z*z; + vec2 denom1 = s + vec2(1.0); + vec2 denom2 = 2.0*s + vec2(4.0); + vec2 denom3 = 6.0*s + vec2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(float s, float z, float s_inv) +{ + // Float version: + float scale = pow(z, s); + float sum = s_inv; + float z_sq = z*z; + float denom1 = s + 1.0; + float denom2 = 2.0*s + 4.0; + float denom3 = 6.0*s + 18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +vec4 uigamma_large_z_impl(vec4 s, vec4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = vec4('inf'); + // vec4 one = vec4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + vec4 numerator = pow(z, s) * exp(-z); + vec4 denom = vec4(7.0) + z - s; + denom = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/denom; + denom = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/denom; + denom = vec4(1.0) + z - s + (s - vec4(1.0))/denom; + return numerator / denom; +} + +vec3 uigamma_large_z_impl(vec3 s, vec3 z) +{ + // vec3 version: + vec3 numerator = pow(z, s) * exp(-z); + vec3 denom = vec3(7.0) + z - s; + denom = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/denom; + denom = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/denom; + denom = vec3(1.0) + z - s + (s - vec3(1.0))/denom; + return numerator / denom; +} + +vec2 uigamma_large_z_impl(vec2 s, vec2 z) +{ + // vec2 version: + vec2 numerator = pow(z, s) * exp(-z); + vec2 denom = vec2(7.0) + z - s; + denom = vec2(5.0) + z - 
s + (3.0*s - vec2(9.0))/denom; + denom = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/denom; + denom = vec2(1.0) + z - s + (s - vec2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(float s, float z) +{ + // Float version: + float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +vec4 normalized_ligamma_impl(vec4 s, vec4 z, + vec4 s_inv, vec4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + vec4 thresh = vec4(0.775075); + bvec4 z_is_large = greaterThan(z , thresh); + vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0); + vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Select per component: large_z where z > thresh, small_z elsewhere (complementary masks): + return large_z * z_size_check + small_z * (vec4(1.0) - z_size_check); +} + +vec3 normalized_ligamma_impl(vec3 s, vec3 z, + vec3 s_inv, vec3 gamma_s_inv) +{ + // vec3 version (large_z where z > thresh, small_z elsewhere): + vec3 thresh = vec3(0.775075); + bvec3 z_is_large = greaterThan(z , thresh); + vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 
1.0 : 0.0, z_is_large.z ? 1.0 : 0.0); + vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (vec3(1.0) - z_size_check); +} + +vec2 normalized_ligamma_impl(vec2 s, vec2 z, + vec2 s_inv, vec2 gamma_s_inv) +{ + // vec2 version (large_z where z > thresh, small_z elsewhere): + vec2 thresh = vec2(0.775075); + bvec2 z_is_large = greaterThan(z , thresh); + vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0); + vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (vec2(1.0) - z_size_check); +} + +float normalized_ligamma_impl(float s, float z, + float s_inv, float gamma_s_inv) +{ + // Float version (large_z where z > thresh, small_z elsewhere): + float thresh = 0.775075; + float z_size_check = 0.0; + if (z > thresh) z_size_check = 1.0; + float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (1.0 - z_size_check); +} + +// Normalized lower incomplete gamma function for small s: +vec4 normalized_ligamma(vec4 s, vec4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ vec4 s_inv = vec4(1.0)/s; + vec4 gamma_s_inv = vec4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +vec3 normalized_ligamma(vec3 s, vec3 z) +{ + // vec3 version: + vec3 s_inv = vec3(1.0)/s; + vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +vec2 normalized_ligamma(vec2 s, vec2 z) +{ + // vec2 version: + vec2 s_inv = vec2(1.0)/s; + vec2 gamma_s_inv = vec2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(float s, float z) +{ + // Float version: + float s_inv = 1.0/s; + float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +/////////////////////////////////// HELPERS ////////////////////////////////// + +vec4 uv2_to_uv4(vec2 tex_uv) +{ + // Make a vec2 uv offset safe for adding to vec4 tex2Dlod coords: + return vec4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +float get_fast_gaussian_weight_sum_inv(float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blurs sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. 
The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using an 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-5, ignoring constant factors (since we're normalizing). + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ vec3 sum = vec3(0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. 
+ // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w01 = w0 * 0.5 + w1; + float w23 = w2 + w3; + float w45 = w4 + w5; + float w01_ratio = w1/w01; + float w23_ratio = w3/w23; + float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w9_10 = w9 + w10; + float w11_12 = w11 + w12; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + float w9_10_ratio = w10/w9_10; + float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + 
(7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x 
Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + float w16 = exp(-256.0 * denom_inv); + float w17 = exp(-289.0 * denom_inv); + float w18 = exp(-324.0 * denom_inv); + float w19 = exp(-361.0 * denom_inv); + float w20 = exp(-400.0 * denom_inv); + float w21 = exp(-441.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w16_17 = w16 + w17; + float w18_19 = w18 + w19; + float w20_21 = w20 + w21; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + float w16_17_ratio = w17/w16_17; + float w18_19_ratio = w19/w18_19; + float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * 
dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + float w01 = w0 * 0.5 + w1; + float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + +vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + float w12 = w1 + w2; + float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w01 = w0 * 0.5 + w1; + float w23 = w2 + w3; + float w01_ratio = w1/w01; + float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). 
+ vec2 sample4_uv = tex_uv; + vec2 dx = vec2(dxdy.x, 0.0); + vec2 dy = vec2(0.0, dxdy.y); + vec2 sample1_uv = sample4_uv - dy; + vec2 sample7_uv = sample4_uv + dy; + vec3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + vec3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + vec3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + vec3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + vec3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + vec3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + float w4 = 1.0; + float w1_3_5_7 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); + float w0_2_6_8 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + vec3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + +// Resizable one-pass blurs: +vec3 tex2Dblur3x3resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3x3resize(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + float w12 = w1 + w2; + float w34 = w3 + w4; + float w12_ratio = w2/w12; + float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). 
The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. + + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). 
+ float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float w4off = exp(-16.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0); + vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio); + vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio); + vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2R1 = w3off; + float w2R2 = w4off; + float w3d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w3d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv); + float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv); + float w6d1 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv); + float w6d4 = exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2R1 + w2R2; + float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + float w5 = w4; + float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * 
sample2R_texel_offset).rgb; + vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ vec3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ float denom_inv = 0.5/(sigma*sigma); + float w0off = 1.0; + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio); + vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio); + vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + float w1abcd = 1.0; + float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); + float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv); + float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv); + float w1d4 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w2d3_3d2 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w2d4_3d4 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d1 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d2_4d3 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb; + vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * 
sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + vec3 sum = vec3(0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). + float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w2d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + vec3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+ // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). + float denom_inv = 0.5/(sigma*sigma); + float w0off = 1.0; + float w1off = exp(-1.0 * denom_inv); + float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + +vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9fast(tex, 
tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur17fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur17fast(texture, tex_uv, dxdy, blur17_std_dev); +} + +vec3 tex2Dblur25fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur25fast(texture, tex_uv, dxdy, blur25_std_dev); +} + +vec3 tex2Dblur43fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur43fast(texture, tex_uv, dxdy, blur43_std_dev); +} +vec3 tex2Dblur31fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur31fast(texture, tex_uv, dxdy, blur31_std_dev); +} + +vec3 tex2Dblur3fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3fast(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur3x3(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3x3(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5fast(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur5resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5resize(texture, tex_uv, dxdy, blur5_std_dev); +} +vec3 tex2Dblur3resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3resize(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5x5(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5x5(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur7resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7resize(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7fast(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7x7(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7x7(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur9resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9resize(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur9x9(sampler2D 
texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9x9(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur11resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur11resize(texture, tex_uv, dxdy, blur11_std_dev); +} + +vec3 tex2Dblur11fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur11fast(texture, tex_uv, dxdy, blur11_std_dev); +} + +#endif // BLUR_FUNCTIONS_H + +#define InputSize sourceSize[0].xy +#define TextureSize sourceSize[0].xy +#define OutputSize targetSize.xy + +void main() { + gl_Position = position; + vTexCoord = texCoord; + // Get the uv sample distance between output pixels. Blurs are not generic + // Gaussian resizers, and correct blurs require: + // 1.) OutputSize == InputSize * 2^m, where m is an integer <= 0. + // 2.) mipmap_inputN = "true" for this pass in the preset if m != 0 + // 3.) filter_linearN = "true" except for 1x scale nearest neighbor blurs + // Gaussian resizers would upsize using the distance between input texels + // (not output pixels), but we avoid this and consistently blur at the + // destination size. Otherwise, combining statically calculated weights + // with bilinear sample exploitation would result in terrible artifacts. 
+ vec2 dxdy_scale = InputSize/OutputSize; + vec2 dxdy = dxdy_scale/TextureSize; + // This blur is horizontal-only, so zero out the vertical offset: + blur_dxdy = vec2(dxdy.x, 0.0); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/blur9fast-vertical.fs b/shaders/CRT-Royale.shader/blur9fast-vertical.fs new file mode 100644 index 00000000..c7293eed --- /dev/null +++ b/shaders/CRT-Royale.shader/blur9fast-vertical.fs @@ -0,0 +1,2016 @@ +#version 150 + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE.
+ +#if __VERSION__ >= 130 +#define COMPAT_TEXTURE texture +#else +#define COMPAT_TEXTURE texture2D +#endif + +#ifdef GL_ES +#define COMPAT_PRECISION mediump +#else +#define COMPAT_PRECISION +#endif + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +in Vertex { + vec2 vTexCoord; + vec2 blur_dxdy; +}; + +out vec4 FragColor; + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +//#define GAMMA_ENCODE_EVERY_FBO +//#define FIRST_PASS +//#define LAST_PASS +//#define SIMULATE_CRT_ON_LCD +//#define SIMULATE_GBA_ON_LCD +//#define SIMULATE_LCD_ON_CRT +//#define SIMULATE_GBA_ON_CRT + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. 
(Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + float lcd_reference_gamma = 2.5; // To match CRT + float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + float get_crt_gamma() { return crt_gamma; } + float get_gba_gamma() { return gba_gamma; } + float get_lcd_gamma() { return lcd_gamma; } +#else + float get_crt_gamma() { return crt_reference_gamma_high; } + float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + float get_intermediate_gamma() { return intermediate_gamma; } + float get_input_gamma() { return input_gamma; } + float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need
to care if anything is being simulated: + float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + float get_input_gamma() { return get_crt_gamma(); } + float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + float get_input_gamma() { return get_gba_gamma(); } + float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + float get_input_gamma() { return get_lcd_gamma(); } + float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + float get_input_gamma() { return get_gba_gamma(); } + float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + float get_input_gamma() { return ntsc_gamma; } + float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + bool linearize_input = true; + float get_pass_input_gamma() { return get_input_gamma(); } + #else + bool linearize_input = false; + float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + bool gamma_encode_output = true; + float get_pass_output_gamma() { return get_output_gamma(); } + #else + bool gamma_encode_output = false; + float get_pass_output_gamma() { return 1.0; } + #endif +#else + bool linearize_input = true; + bool gamma_encode_output = true; + #ifdef FIRST_PASS + float get_pass_input_gamma() { return get_input_gamma(); } + #else + float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + float get_pass_output_gamma() { return get_output_gamma(); } + #else + float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +vec4 decode_input(vec4 color) // Linearizes a sampled color: raises rgb to get_pass_input_gamma() when linearize_input is set, else returns color unchanged. +{ + if(linearize_input) // Test the flag chosen by FIRST_PASS/GAMMA_ENCODE_EVERY_FBO above; never assign to it here. + { + if(assume_opaque_alpha) // Opaque mode: ignore the sampled alpha and emit 1.0. + { + return vec4(pow(color.rgb,
vec3(get_pass_input_gamma())), 1.0); + } + else // Otherwise pass the sampled alpha through untouched. + { + return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +vec4 encode_output(vec4 color) // Gamma-encodes a color: raises rgb to 1.0/get_pass_output_gamma() when gamma_encode_output is set, else returns color unchanged. +{ + if(gamma_encode_output) // Test the flag chosen by LAST_PASS/GAMMA_ENCODE_EVERY_FBO above; never assign to it here. + { + if(assume_opaque_alpha) // Opaque mode: force the output alpha to 1.0. + { + return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0); + } + else // Otherwise keep the incoming alpha. + { + return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords) +//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords))); } + +//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E))) +//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off) +//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off))); } + +#endif // GAMMA_MANAGEMENT_H + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// IN.output_size < IN.video_size. +// 4.) IN.output_size == IN.video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (IN.video_size/IN.output_size)/IN.texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(IN.video_size/IN.output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) 
sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static float blur3_std_dev +// static float blur4_std_dev +// static float blur5_std_dev +// static float blur6_std_dev +// static float blur7_std_dev +// static float blur8_std_dev +// static float blur9_std_dev +// static float blur10_std_dev +// static float blur11_std_dev +// static float blur12_std_dev +// static float blur17_std_dev +// static float blur25_std_dev +// static float blur31_std_dev +// static float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. 
However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. +// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. 
Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). +// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ float blur3_std_dev = 0.84931640625; + float blur4_std_dev = 0.84931640625; + float blur5_std_dev = 1.0595703125; + float blur6_std_dev = 1.06591796875; + float blur7_std_dev = 1.17041015625; + float blur8_std_dev = 1.1720703125; + float blur9_std_dev = 1.2259765625; + float blur10_std_dev = 1.21982421875; + float blur11_std_dev = 1.25361328125; + float blur12_std_dev = 1.2423828125; + float blur17_std_dev = 1.27783203125; + float blur25_std_dev = 1.2810546875; + float blur31_std_dev = 1.28125; + float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + float blur3_std_dev = 0.62666015625; + float blur4_std_dev = 0.66171875; + float blur5_std_dev = 0.9845703125; + float blur6_std_dev = 1.02626953125; + float blur7_std_dev = 1.36103515625; + float blur8_std_dev = 1.4080078125; + float blur9_std_dev = 1.7533203125; + float blur10_std_dev = 1.80478515625; + float blur11_std_dev = 2.15986328125; + float blur12_std_dev = 2.215234375; + float blur17_std_dev = 3.45535583496; + float blur25_std_dev = 5.3409576416; + float blur31_std_dev = 6.86488037109; + float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. + float error_blurring = 0.5; +#endif + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. 
+//#include "gamma-management.h" +//#include "quad-pixel-communication.h" +//#include "special-functions.h" + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). 
+// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (vec4/vec3/vec2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +vec4 erf6(vec4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + vec4 one = vec4(1.0); + vec4 sign_x = sign(x); + vec4 t = one/(one + 0.47047*abs(x)); + vec4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec3 erf6(vec3 x) +{ + // vec3 version: + vec3 one = vec3(1.0); + vec3 sign_x = sign(x); + vec3 t = one/(one + 0.47047*abs(x)); + vec3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec2 erf6(vec2 x) +{ + // vec2 version: + vec2 one = vec2(1.0); + vec2 sign_x = sign(x); + vec2 t = one/(one + 0.47047*abs(x)); + vec2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(float x) +{ + // Float version: + float sign_x = sign(x); + float t = 1.0/(1.0 + 0.47047*abs(x)); + float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +vec4 erft(vec4 x) +{ + 
// Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. + return tanh(1.202760580 * x); +} + +vec3 erft(vec3 x) +{ + // vec3 version: + return tanh(1.202760580 * x); +} + +vec2 erft(vec2 x) +{ + // vec2 version: + return tanh(1.202760580 * x); +} + +float erft(float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +vec4 erf(vec4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +vec3 erf(vec3 x) +{ + // vec3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +vec2 erf(vec2 x) +{ + // vec2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +float erf(float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +vec4 gamma_impl(vec4 s, vec4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. 
+ // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally-spaced + // evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. + vec4 g = vec4(1.12906830989); + vec4 c0 = vec4(0.8109119309638332633713423362694399653724431); + vec4 c1 = vec4(0.4808354605142681877121661197951496120000040); + vec4 e = vec4(2.71828182845904523536028747135266249775724709); + vec4 sph = s + vec4(0.5); + vec4 lanczos_sum = c0 + c1/(s + vec4(1.0)); + vec4 base = (sph + g)/e; // or (s + g + vec4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. 
+ return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec3 gamma_impl(vec3 s, vec3 s_inv) +{ + // vec3 version: + vec3 g = vec3(1.12906830989); + vec3 c0 = vec3(0.8109119309638332633713423362694399653724431); + vec3 c1 = vec3(0.4808354605142681877121661197951496120000040); + vec3 e = vec3(2.71828182845904523536028747135266249775724709); + vec3 sph = s + vec3(0.5); + vec3 lanczos_sum = c0 + c1/(s + vec3(1.0)); + vec3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec2 gamma_impl(vec2 s, vec2 s_inv) +{ + // vec2 version: + vec2 g = vec2(1.12906830989); + vec2 c0 = vec2(0.8109119309638332633713423362694399653724431); + vec2 c1 = vec2(0.4808354605142681877121661197951496120000040); + vec2 e = vec2(2.71828182845904523536028747135266249775724709); + vec2 sph = s + vec2(0.5); + vec2 lanczos_sum = c0 + c1/(s + vec2(1.0)); + vec2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(float s, float s_inv) +{ + // Float version: + float g = 1.12906830989; + float c0 = 0.8109119309638332633713423362694399653724431; + float c1 = 0.4808354605142681877121661197951496120000040; + float e = 2.71828182845904523536028747135266249775724709; + float sph = s + 0.5; + float lanczos_sum = c0 + c1/(s + 1.0); + float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +vec4 gamma(vec4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. 
+ return gamma_impl(s, vec4(1.0)/s); +} + +vec3 gamma(vec3 s) +{ + // vec3 version: + return gamma_impl(s, vec3(1.0)/s); +} + +vec2 gamma(vec2 s) +{ + // vec2 version: + return gamma_impl(s, vec2(1.0)/s); +} + +float gamma(float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). + // The actual "rolled up" summation looks like: + // k = 0; last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + k) * last_factorial) + // for(k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + vec4 scale = pow(z, s); + vec4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + vec4 z_sq = z*z; + vec4 denom1 = s + vec4(1.0); + vec4 denom2 = 2.0*s + vec4(4.0); + vec4 denom3 = 6.0*s + vec4(18.0); + //vec4 denom4 = 24.0*s + vec4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv) +{ + // vec3 version: + vec3 scale = pow(z, s); + vec3 sum = s_inv; + vec3 z_sq = z*z; + vec3 denom1 = s + vec3(1.0); + vec3 denom2 = 2.0*s + vec3(4.0); + vec3 denom3 = 6.0*s + vec3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv) +{ + // vec2 version: + vec2 scale = 
pow(z, s); + vec2 sum = s_inv; + vec2 z_sq = z*z; + vec2 denom1 = s + vec2(1.0); + vec2 denom2 = 2.0*s + vec2(4.0); + vec2 denom3 = 6.0*s + vec2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(float s, float z, float s_inv) +{ + // Float version: + float scale = pow(z, s); + float sum = s_inv; + float z_sq = z*z; + float denom1 = s + 1.0; + float denom2 = 2.0*s + 4.0; + float denom3 = 6.0*s + 18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +vec4 uigamma_large_z_impl(vec4 s, vec4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = vec4('inf'); + // vec4 one = vec4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + vec4 numerator = pow(z, s) * exp(-z); + vec4 denom = vec4(7.0) + z - s; + denom = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/denom; + denom = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/denom; + denom = vec4(1.0) + z - s + (s - vec4(1.0))/denom; + return numerator / denom; +} + +vec3 uigamma_large_z_impl(vec3 s, vec3 z) +{ + // vec3 version: + vec3 numerator = pow(z, s) * exp(-z); + vec3 denom = vec3(7.0) + z - s; + denom = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/denom; + denom = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/denom; + denom = vec3(1.0) + z - s + (s - vec3(1.0))/denom; + return numerator / denom; +} + +vec2 uigamma_large_z_impl(vec2 s, vec2 z) +{ + // vec2 version: + vec2 numerator = pow(z, s) * exp(-z); + vec2 denom = vec2(7.0) + z - s; + denom = vec2(5.0) + z - 
s + (3.0*s - vec2(9.0))/denom; + denom = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/denom; + denom = vec2(1.0) + z - s + (s - vec2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(float s, float z) +{ + // Float version: + float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +vec4 normalized_ligamma_impl(vec4 s, vec4 z, + vec4 s_inv, vec4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + vec4 thresh = vec4(0.775075); + bvec4 z_is_large = greaterThan(z , thresh); + vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0); + vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Select large_z where z > thresh and small_z elsewhere (complementary masks): + return large_z * z_size_check + small_z * (vec4(1.0) - z_size_check); +} + +vec3 normalized_ligamma_impl(vec3 s, vec3 z, + vec3 s_inv, vec3 gamma_s_inv) +{ + // vec3 version: + vec3 thresh = vec3(0.775075); + bvec3 z_is_large = greaterThan(z , thresh); + vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 
1.0 : 0.0, z_is_large.z ? 1.0 : 0.0); + vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (vec3(1.0) - z_size_check); +} + +vec2 normalized_ligamma_impl(vec2 s, vec2 z, + vec2 s_inv, vec2 gamma_s_inv) +{ + // vec2 version: + vec2 thresh = vec2(0.775075); + bvec2 z_is_large = greaterThan(z , thresh); + vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0); + vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (vec2(1.0) - z_size_check); +} + +float normalized_ligamma_impl(float s, float z, + float s_inv, float gamma_s_inv) +{ + // Float version: + float thresh = 0.775075; + float z_size_check = 0.0; + if (z > thresh) z_size_check = 1.0; + float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * z_size_check + small_z * (1.0 - z_size_check); +} + +// Normalized lower incomplete gamma function for small s: +vec4 normalized_ligamma(vec4 s, vec4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ vec4 s_inv = vec4(1.0)/s; + vec4 gamma_s_inv = vec4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +vec3 normalized_ligamma(vec3 s, vec3 z) +{ + // vec3 version: + vec3 s_inv = vec3(1.0)/s; + vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +vec2 normalized_ligamma(vec2 s, vec2 z) +{ + // vec2 version: + vec2 s_inv = vec2(1.0)/s; + vec2 gamma_s_inv = vec2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(float s, float z) +{ + // Float version: + float s_inv = 1.0/s; + float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +/////////////////////////////////// HELPERS ////////////////////////////////// + +vec4 uv2_to_uv4(vec2 tex_uv) +{ + // Make a vec2 uv offset safe for adding to vec4 tex2Dlod coords: + return vec4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +float get_fast_gaussian_weight_sum_inv(float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(0.5/(sigma*sqrt(2.0)))): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blur sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. 
The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using a 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-4, ignoring constant factors (since we're normalizing). + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ vec3 sum = vec3(0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. 
+    // It may be mipmapped depending on settings and dxdy.
+    // First get the texel weights and normalization factor as above.
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    float weight_sum_inv = 1.0 / (w0 + 2.0 * w1);
+    // Statically normalize weights, sum weighted samples, and return:
+    vec3 sum = vec3(0.0);
+    sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+/////////////////////////// FAST SEPARABLE BLURS ///////////////////////////
+
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // 1D Gaussian blur over 11 texels, fetched as 6 taps that each sit
+    // between two adjacent texels.
+    // Requires: 1.) Global requirements must be met (see file description).
+    //           2.) filter_linearN must = "true" in your .cgp file.
+    //           3.) For gamma-correct bilinear filtering, global
+    //               gamma_aware_bilinear == true (from gamma-management.h)
+    // Params:   tex    - sampler to read from.
+    //           tex_uv - coordinates of the blur center.
+    //           dxdy   - uv step between adjacent texels.
+    //           sigma  - Gaussian standard deviation, measured in texels.
+    // Returns:  A 1D 11x Gaussian blurred texture lookup using 6 linear
+    //           taps. It may be mipmapped depending on settings and dxdy.
+
+    // The texel k texels from the center has the unnormalized weight
+    // exp(-k*k / (2.0 * sigma*sigma)):
+    float inv_two_variance = 0.5/(sigma*sigma);
+    float texel0 = 1.0;
+    float texel1 = exp(-1.0 * inv_two_variance);
+    float texel2 = exp(-4.0 * inv_two_variance);
+    float texel3 = exp(-9.0 * inv_two_variance);
+    float texel4 = exp(-16.0 * inv_two_variance);
+    float texel5 = exp(-25.0 * inv_two_variance);
+    float normalizer = 1.0 /
+        (texel0 + 2.0 * (texel1 + texel2 + texel3 + texel4 + texel5));
+    // Each tap stands in for a pair of neighboring texels, so its weight
+    // is the pair's combined weight. The center texel belongs to both
+    // innermost pairs; each of them gets half of its weight.
+    float pair01 = texel0 * 0.5 + texel1;
+    float pair23 = texel2 + texel3;
+    float pair45 = texel4 + texel5;
+    // Place each tap between its two texels, shifted toward the outer
+    // texel by that texel's share of the pair weight:
+    float shift01 = texel1/pair01;
+    float shift23 = texel3/pair23;
+    float shift45 = texel5/pair45;
+    vec2 offset01 = shift01 * dxdy;
+    vec2 offset23 = (2.0 + shift23) * dxdy;
+    vec2 offset45 = (4.0 + shift45) * dxdy;
+    // Accumulate from the most negative offset to the most positive:
+    vec3 blurred = vec3(0.0);
+    blurred += pair45 * tex2D_linearize(tex, tex_uv - offset45).rgb;
+    blurred += pair23 * tex2D_linearize(tex, tex_uv - offset23).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv - offset01).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv + offset01).rgb;
+    blurred += pair23 * tex2D_linearize(tex, tex_uv + offset23).rgb;
+    blurred += pair45 * tex2D_linearize(tex, tex_uv + offset45).rgb;
+    return blurred * normalizer;
+}
+
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // Requires: Same as tex2Dblur11()
+    // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    // neighbor and 8 linear taps. It may be mipmapped depending
+    // on settings and dxdy.
+    // First get the texel weights and normalization factor as above.
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    float w2 = exp(-4.0 * denom_inv);
+    float w3 = exp(-9.0 * denom_inv);
+    float w4 = exp(-16.0 * denom_inv);
+    float w5 = exp(-25.0 * denom_inv);
+    float w6 = exp(-36.0 * denom_inv);
+    float w7 = exp(-49.0 * denom_inv);
+    float w8 = exp(-64.0 * denom_inv);
+    //float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    // Calculate combined weights and linear sample ratios between texel pairs.
+ float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w9_10 = w9 + w10; + float w11_12 = w11 + w12; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + float w9_10_ratio = w10/w9_10; + float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + 
(7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x 
Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + float w16 = exp(-256.0 * denom_inv); + float w17 = exp(-289.0 * denom_inv); + float w18 = exp(-324.0 * denom_inv); + float w19 = exp(-361.0 * denom_inv); + float w20 = exp(-400.0 * denom_inv); + float w21 = exp(-441.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w16_17 = w16 + w17; + float w18_19 = w18 + w19; + float w20_21 = w20 + w21; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + float w16_17_ratio = w17/w16_17; + float w18_19_ratio = w19/w18_19; + float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * 
dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // 1D Gaussian blur over 3 texels, fetched as 2 taps placed between
+    // the center texel and each of its neighbors.
+    // Requires: Same as tex2Dblur11()
+    // Params:   tex    - sampler to read from.
+    //           tex_uv - coordinates of the blur center.
+    //           dxdy   - uv step between adjacent texels.
+    //           sigma  - Gaussian standard deviation, measured in texels.
+    // Returns:  A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //           taps. It may be mipmapped depending on settings and dxdy.
+
+    // Unnormalized Gaussian weights of the center texel and of a texel
+    // one step away:
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    // Calculate combined weights and linear sample ratios between texel pairs.
+    // The center texel (with weight w0) is used twice, so halve its weight.
+    float w01 = w0 * 0.5 + w1;
+    float w01_ratio = w1/w01;
+    // Both taps carry the same combined weight w01, and together they
+    // account for the whole kernel (2.0 * w01 == w0 + 2.0 * w1). Each
+    // normalized tap weight is therefore exactly 0.5, so no separate
+    // normalization factor has to be computed; just average the taps:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // Requires: Same as tex2Dblur11()
+    // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest
+    // neighbor and 2 linear taps. It may be mipmapped depending
+    // on settings and dxdy.
+    // First get the texel weights and normalization factor as above.
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    float w2 = exp(-4.0 * denom_inv);
+    float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2));
+    // Calculate combined weights and linear sample ratios between texel pairs.
+    float w12 = w1 + w2;
+    float w12_ratio = w2/w12;
+    // Statically normalize weights, sum weighted samples, and return:
+    vec3 sum = vec3(0.0);
+    sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // 1D Gaussian blur over 7 texels, fetched as 4 taps that each sit
+    // between two adjacent texels.
+    // Requires: Same as tex2Dblur11()
+    // Params:   tex    - sampler to read from.
+    //           tex_uv - coordinates of the blur center.
+    //           dxdy   - uv step between adjacent texels.
+    //           sigma  - Gaussian standard deviation, measured in texels.
+    // Returns:  A 1D 7x Gaussian blurred texture lookup using 4 linear
+    //           taps. It may be mipmapped depending on settings and dxdy.
+
+    // The texel k texels from the center has the unnormalized weight
+    // exp(-k*k / (2.0 * sigma*sigma)):
+    float inv_two_variance = 0.5/(sigma*sigma);
+    float texel0 = 1.0;
+    float texel1 = exp(-1.0 * inv_two_variance);
+    float texel2 = exp(-4.0 * inv_two_variance);
+    float texel3 = exp(-9.0 * inv_two_variance);
+    float normalizer = 1.0 / (texel0 + 2.0 * (texel1 + texel2 + texel3));
+    // Each tap stands in for a pair of neighboring texels, so its weight
+    // is the pair's combined weight. The center texel belongs to both
+    // inner pairs; each of them gets half of its weight.
+    float pair01 = texel0 * 0.5 + texel1;
+    float pair23 = texel2 + texel3;
+    // Place each tap between its two texels, shifted toward the outer
+    // texel by that texel's share of the pair weight:
+    float shift01 = texel1/pair01;
+    float shift23 = texel3/pair23;
+    vec2 offset01 = shift01 * dxdy;
+    vec2 offset23 = (2.0 + shift23) * dxdy;
+    // Accumulate from the most negative offset to the most positive:
+    vec3 blurred = vec3(0.0);
+    blurred += pair23 * tex2D_linearize(tex, tex_uv - offset23).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv - offset01).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv + offset01).rgb;
+    blurred += pair23 * tex2D_linearize(tex, tex_uv + offset23).rgb;
+    return blurred * normalizer;
+}
+
+//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS ////////////////////
+
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // Requires: Global requirements must be met (see file description).
+    // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the
+    // resized input.
+    // Description:
+    // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize
+    // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    float denom_inv = 0.5/(sigma*sigma);
+    // Load each sample. We need all 3x3 samples. Quad-pixel communication
+    // won't help either: This should perform like tex2Dblur5x5, but sharing a
+    // 4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+    vec2 sample4_uv = tex_uv;
+    vec2 dx = vec2(dxdy.x, 0.0);
+    vec2 dy = vec2(0.0, dxdy.y);
+    vec2 sample1_uv = sample4_uv - dy;
+    vec2 sample7_uv = sample4_uv + dy;
+    vec3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb;
+    vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    vec3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb;
+    vec3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb;
+    vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    vec3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb;
+    vec3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb;
+    vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    vec3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb;
+    // Statically compute Gaussian sample weights:
+    float w4 = 1.0;
+    float w1_3_5_7 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
+    float w0_2_6_8 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8));
+    // Weight and sum the samples:
+    vec3 sum = w4 * sample4 +
+        w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) +
+        w0_2_6_8 * (sample0 + sample2 + sample6 + sample8);
+    return sum * weight_sum_inv;
+}
+
+// Resizable one-pass blurs:
+// Convenience overload of tex2Dblur3x3resize() that supplies the standard
+// deviation itself: it forwards to the four-argument version with
+// blur3_std_dev as sigma.
+// Params:  tex    - sampler to read from.
+//          tex_uv - coordinates of the center sample.
+//          dxdy   - uv step between adjacent samples.
+// Returns: Whatever the four-argument overload returns for these inputs.
+// NOTE(review): blur3_std_dev is declared outside this chunk; presumably
+// it is the default sigma for the 3-wide blurs - confirm at its definition.
+// The sampler parameter is called "tex", as in every other blur function
+// here, so that it does not hide the GLSL built-in function texture().
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // Requires: Same as tex2Dblur11()
+    // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest
+    // neighbor and 4 linear taps. It may be mipmapped depending
+    // on settings and dxdy.
+    // First get the texel weights and normalization factor as above.
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + float w12 = w1 + w2; + float w34 = w3 + w4; + float w12_ratio = w2/w12; + float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). 
The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. + + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). 
+ float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float w4off = exp(-16.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0); + vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio); + vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio); + vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2R1 = w3off; + float w2R2 = w4off; + float w3d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w3d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv); + float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv); + float w6d1 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv); + float w6d4 = exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2R1 + w2R2; + float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + float w5 = w4; + float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * 
sample2R_texel_offset).rgb; + vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+    vec3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    // Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    // Requires: Same as tex2Dblur9()
+    // Params:  tex    - sampler to read from.
+    //          tex_uv - coordinates of the blur center.
+    //          dxdy   - uv step between adjacent texels, per axis.
+    //          sigma  - Gaussian standard deviation, measured in texels.
+    // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    // 4x4 carefully selected bilinear samples.
+    // NOTE(review): tex2Dblur9() and tex2Dblur7() are not defined in the
+    // visible part of this file; presumably tex2Dblur9fast() and
+    // tex2Dblur7fast() are meant - confirm.
+    // Description:
+    // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This
+    // blur mixes concepts from both. The sample layout is as follows:
+    // 4a 3a 3b 4b
+    // 2a 1a 1b 2b
+    // 2c 1c 1d 2d
+    // 4c 3c 3d 4d
+    // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d,
+    // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    // the center texel):
+    // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4
+    // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2
+    // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4
+    // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2
+    // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4
+    // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2
+    // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4
+
+    // COMPUTE TEXTURE COORDS:
+    // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ float denom_inv = 0.5/(sigma*sigma); + float w0off = 1.0; + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio); + vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio); + vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + float w1abcd = 1.0; + float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); + float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv); + float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv); + float w1d4 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w2d3_3d2 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w2d4_3d4 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d1 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d2_4d3 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb; + vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * 
sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + vec3 sum = vec3(0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). + float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w2d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + vec3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Perform a 1-pass 3x3 blur with 5x5 bilinear samples. 
+ // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). + float denom_inv = 0.5/(sigma*sigma); + float w0off = 1.0; + float w1off = exp(-1.0 * denom_inv); + float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + +vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9fast(tex, 
tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur17fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur17fast(texture, tex_uv, dxdy, blur17_std_dev); +} + +vec3 tex2Dblur25fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur25fast(texture, tex_uv, dxdy, blur25_std_dev); +} + +vec3 tex2Dblur43fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur43fast(texture, tex_uv, dxdy, blur43_std_dev); +} +vec3 tex2Dblur31fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur31fast(texture, tex_uv, dxdy, blur31_std_dev); +} + +vec3 tex2Dblur3fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3fast(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur3x3(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3x3(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5fast(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur5resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5resize(texture, tex_uv, dxdy, blur5_std_dev); +} +vec3 tex2Dblur3resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3resize(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5x5(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5x5(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur7resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7resize(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7fast(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7x7(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7x7(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur9resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9resize(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur9x9(sampler2D 
texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9x9(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur11resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur11resize(texture, tex_uv, dxdy, blur11_std_dev); +} + +vec3 tex2Dblur11fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur11fast(texture, tex_uv, dxdy, blur11_std_dev); +} + +#endif // BLUR_FUNCTIONS_H + +#define Source source[0] +#define tex_uv vTexCoord.xy + +#define InputSize sourceSize[0].xy +#define TextureSize sourceSize[0].xy +#define OutputSize targetSize.xy + +void main() { + vec3 color = tex2Dblur9fast(Source, tex_uv, blur_dxdy); + // Encode and output the blurred image: + FragColor = encode_output(vec4(color, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/blur9fast-vertical.vs b/shaders/CRT-Royale.shader/blur9fast-vertical.vs new file mode 100644 index 00000000..8c10ad96 --- /dev/null +++ b/shaders/CRT-Royale.shader/blur9fast-vertical.vs @@ -0,0 +1,2025 @@ +#version 150 + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +#if __VERSION__ >= 130 +#define COMPAT_TEXTURE texture +#else +#define COMPAT_TEXTURE texture2D +#endif + +#ifdef GL_ES +#define COMPAT_PRECISION mediump +#else +#define COMPAT_PRECISION +#endif + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 blur_dxdy; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +//#define GAMMA_ENCODE_EVERY_FBO +//#define FIRST_PASS +//#define LAST_PASS +//#define SIMULATE_CRT_ON_LCD +//#define SIMULATE_GBA_ON_LCD +//#define SIMULATE_LCD_ON_CRT +//#define SIMULATE_GBA_ON_CRT + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? 
+    float pal_gamma = 2.8; // Never actually 2.8 in practice
+    // Typical device decoding gammas (only use for emulating devices):
+    // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    // gammas: The standards purposely undercorrected for an analog CRT's
+    // assumed 2.5 reference display gamma to maintain contrast in assumed
+    // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    // These unstated assumptions about display gamma and perceptual rendering
+    // intent caused a lot of confusion, and more modern CRT's seemed to target
+    // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
+    // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    // displays designed to view sRGB in bright environments. (Standards are
+    // also in flux again with BT.1886, but it's underspecified for displays.)
+    float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
+    float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
+    float lcd_reference_gamma = 2.5; // To match CRT
+    float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
+    float lcd_office_gamma = 2.2; // Approximates sRGB
+#endif // OVERRIDE_STANDARD_GAMMA
+
+// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+// but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    bool assume_opaque_alpha = false;
+#endif
+
+
+/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
+
+// gamma-management.h should be compatible with overriding gamma values with
+// runtime user parameters, but we can only define other global constants in
+// terms of static constants, not uniform user parameters. To get around this
+// limitation, we need to define derived constants using functions.
+
+// Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    // The user promises to globally define the appropriate constants:
+    float get_crt_gamma() { return crt_gamma; }
+    float get_gba_gamma() { return gba_gamma; }
+    float get_lcd_gamma() { return lcd_gamma; }
+#else
+    float get_crt_gamma() { return crt_reference_gamma_high; }
+    float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
+    float get_lcd_gamma() { return lcd_office_gamma; }
+#endif // OVERRIDE_DEVICE_GAMMA
+
+// Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    // The user promises to globally define the appropriate constants:
+    float get_intermediate_gamma() { return intermediate_gamma; }
+    float get_input_gamma() { return input_gamma; }
+    float get_output_gamma() { return output_gamma; }
+#else
+    // If we gamma-correct every pass, always use ntsc_gamma between passes to
+    // ensure middle passes don't need to care if anything is being simulated:
+    float get_intermediate_gamma() { return ntsc_gamma; }
+    #ifdef SIMULATE_CRT_ON_LCD
+        float get_input_gamma() { return get_crt_gamma(); }
+        float get_output_gamma() { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        float get_input_gamma() { return get_gba_gamma(); }
+        float get_output_gamma() { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        float get_input_gamma() { return get_lcd_gamma(); }
+        float get_output_gamma() { return get_crt_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        float get_input_gamma() { return get_gba_gamma(); }
+        float get_output_gamma() { return get_crt_gamma(); }
+    #else // Don't simulate anything:
+        float get_input_gamma() { return ntsc_gamma; }
+        float get_output_gamma() { return ntsc_gamma; }
+    #endif // SIMULATE_GBA_ON_CRT
+    #endif // SIMULATE_LCD_ON_CRT
+    #endif // SIMULATE_GBA_ON_LCD
+    #endif // SIMULATE_CRT_ON_LCD
+#endif // OVERRIDE_FINAL_GAMMA
+
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        bool linearize_input = true;
+        float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        bool linearize_input = false;
+        float get_pass_input_gamma() { return 1.0; }
+    #endif
+    #ifdef LAST_PASS
+        bool gamma_encode_output = true;
+        float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        bool gamma_encode_output = false;
+        float get_pass_output_gamma() { return 1.0; }
+    #endif
+#else
+    bool linearize_input = true;
+    bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif
+    #ifdef LAST_PASS
+        float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif
+#endif
+
+vec4 decode_input(vec4 color) // Gamma-decode color.rgb for this pass.
+{
+    if(linearize_input) // Test the flag; "= true" would assign to it instead.
+    {
+        if(assume_opaque_alpha) // Opted in: force the returned alpha to 1.0.
+        {
+            return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), 1.0);
+        }
+        else // Keep the incoming alpha.
+        {
+            return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a);
+        }
+    }
+    else // Linearization is off for this pass: return color untouched.
+    {
+        return color;
+    }
+}
+
+vec4 encode_output(vec4 color) // Gamma-encode color.rgb for this pass.
+{
+    if(gamma_encode_output) // Test the flag; "= true" would assign to it.
+    {
+        if(assume_opaque_alpha) // Opted in: force the returned alpha to 1.0.
+        {
+            return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0);
+        }
+        else // Keep the incoming alpha.
+        {
+            return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a);
+        }
+    }
+    else // Encoding is off for this pass: return color untouched.
+    {
+        return color;
+    }
+}
+
+#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords)
+//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords))); }
+
+//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off)
+//{ return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off))); }
+
+#endif // GAMMA_MANAGEMENT_H
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// IN.output_size < IN.video_size. +// 4.) IN.output_size == IN.video_size / pow(2, M), where M is some +// positive integer. 
tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (IN.video_size/IN.output_size)/IN.texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(IN.video_size/IN.output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static float blur3_std_dev +// static float blur4_std_dev +// static float blur5_std_dev +// static float blur6_std_dev +// static float blur7_std_dev +// static float blur8_std_dev +// static float blur9_std_dev +// static float blur10_std_dev +// static float blur11_std_dev +// static float blur12_std_dev +// static float blur17_std_dev +// static float blur25_std_dev +// static float blur31_std_dev +// static float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+        float blur3_std_dev = 0.84931640625; // Binomial (p = 0.5) set starts here.
+        float blur4_std_dev = 0.84931640625;
+        float blur5_std_dev = 1.0595703125;
+        float blur6_std_dev = 1.06591796875;
+        float blur7_std_dev = 1.17041015625;
+        float blur8_std_dev = 1.1720703125;
+        float blur9_std_dev = 1.2259765625;
+        float blur10_std_dev = 1.21982421875;
+        float blur11_std_dev = 1.25361328125;
+        float blur12_std_dev = 1.2423828125;
+        float blur17_std_dev = 1.27783203125;
+        float blur25_std_dev = 1.2810546875;
+        float blur31_std_dev = 1.28125;
+        float blur43_std_dev = 1.28125;
+    #else
+        // The defaults are the largest values that keep the largest unused
+        // blur term on each side <= 1.0/256.0. (We could get away with more
+        // or be more conservative, but this compromise is pretty reasonable.)
+        float blur3_std_dev = 0.62666015625;
+        float blur4_std_dev = 0.66171875;
+        float blur5_std_dev = 0.9845703125;
+        float blur6_std_dev = 1.02626953125;
+        float blur7_std_dev = 1.36103515625;
+        float blur8_std_dev = 1.4080078125;
+        float blur9_std_dev = 1.7533203125;
+        float blur10_std_dev = 1.80478515625;
+        float blur11_std_dev = 2.15986328125;
+        float blur12_std_dev = 2.215234375;
+        float blur17_std_dev = 3.45535583496;
+        float blur25_std_dev = 5.3409576416;
+        float blur31_std_dev = 6.86488037109;
+        float blur43_std_dev = 10.1852050781;
+    #endif // USE_BINOMIAL_BLUR_STD_DEVS
+#endif // OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing
+    // in shared-sample blurs but increase blurring and feature shifting.
+    float error_blurring = 0.5; // Default when the includer doesn't override it.
+#endif
+
+// Make a length squared helper macro (for usage with static constants):
+#define LENGTH_SQ(vec) (dot(vec, vec)) // The argument text is expanded twice.
+
+////////////////////////////////// INCLUDES //////////////////////////////////
+
+// gamma-management.h relies on pass-specific settings to guide its behavior:
+// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details.
+//#include "gamma-management.h" +//#include "quad-pixel-communication.h" +//#include "special-functions.h" + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). 
+// See below for details.
+//
+// Design Rationale:
+// Pretty much every line of code in this file is duplicated four times for
+// different input types (vec4/vec3/vec2/float). This is unfortunate,
+// but Cg doesn't allow function templates. Macros would be far less verbose,
+// but they would make the code harder to document and read. I don't expect
+// these functions will require a whole lot of maintenance changes unless
+// someone ever has need for more robust incomplete gamma functions, so code
+// duplication seems to be the lesser evil in this case.
+
+
+/////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
+
+vec4 erf6(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              The stated max absolute error is 2.5*10**-5. See:
+    //  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    vec4 unit = vec4(1.0);
+    vec4 x_sign = sign(x);
+    //  Work on |x| and restore the sign at the end:
+    vec4 u = unit/(unit + 0.47047*abs(x));
+    //  Cubic in u (Horner form), damped by e**(-x**2):
+    vec4 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    vec4 magnitude = unit - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+vec3 erf6(vec3 x)
+{
+    //  vec3 version of erf6(vec4):
+    vec3 unit = vec3(1.0);
+    vec3 x_sign = sign(x);
+    vec3 u = unit/(unit + 0.47047*abs(x));
+    vec3 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    vec3 magnitude = unit - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+vec2 erf6(vec2 x)
+{
+    //  vec2 version of erf6(vec4):
+    vec2 unit = vec2(1.0);
+    vec2 x_sign = sign(x);
+    vec2 u = unit/(unit + 0.47047*abs(x));
+    vec2 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    vec2 magnitude = unit - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+float erf6(float x)
+{
+    //  Scalar version of erf6(vec4):
+    float x_sign = sign(x);
+    float u = 1.0/(1.0 + 0.47047*abs(x));
+    float poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * x_sign;
+}
+
+vec4 erft(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Approximate erf() with the hyperbolic tangent.  The error
+    //              is visually noticeable, but it's blazing fast and
+    //              perceptually close...at least on ATI hardware.  See:
+    //              http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly
+    //              implement tanh(): The original author reports garbage
+    //              output on an nVidia 8800GTS.
+    vec4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+vec3 erft(vec3 x)
+{
+    //  vec3 version of erft(vec4):
+    vec3 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+vec2 erft(vec2 x)
+{
+    //  vec2 version of erft(vec4):
+    vec2 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float erft(float x)
+{
+    //  Scalar version of erft(vec4):
+    float scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+vec4 erf(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) when ERF_FAST_APPROXIMATION is defined, and
+    //              erf6(x) otherwise.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec3 erf(vec3 x)
+{
+    //  vec3 version of erf(vec4):
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec2 erf(vec2 x)
+{
+    //  vec2 version of erf(vec4):
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+float erf(float x)
+{
+    //  Scalar version of erf(vec4):
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+/////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
+
+vec4 gamma_impl(vec4 s, vec4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value so
+    //                  that it can be reused outside this function.
+    //  Returns:    Approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two
+    //              coefficients calculated using Paul Godfrey's method here:
+    //              http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989,
+    //              with a maximum relative error of 0.000463 for 2**16
+    //              equally-spaced evals.  Three coeffs (0.0000346 error)
+    //              would not hurt latency, but two allow more parallelism
+    //              with outside instructions.
+    vec4 lanczos_g = vec4(1.12906830989);
+    vec4 coeff0 = vec4(0.8109119309638332633713423362694399653724431);
+    vec4 coeff1 = vec4(0.4808354605142681877121661197951496120000040);
+    vec4 euler = vec4(2.71828182845904523536028747135266249775724709);
+    vec4 s_plus_half = s + vec4(0.5);
+    vec4 series = coeff0 + coeff1/(s + vec4(1.0));
+    vec4 power_base = (s_plus_half + lanczos_g)/euler;
+    //  gamma(s + 1) = power_base**s_plus_half * series; dividing by s gives
+    //  gamma(s).  The original notes this has less error for small s than
+    //  subtracting 1.0 from s up front.
+    vec4 gamma_s_plus_1 = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_1 * s_inv;
+}
+
+vec3 gamma_impl(vec3 s, vec3 s_inv)
+{
+    //  vec3 version of gamma_impl(vec4, vec4):
+    vec3 lanczos_g = vec3(1.12906830989);
+    vec3 coeff0 = vec3(0.8109119309638332633713423362694399653724431);
+    vec3 coeff1 = vec3(0.4808354605142681877121661197951496120000040);
+    vec3 euler = vec3(2.71828182845904523536028747135266249775724709);
+    vec3 s_plus_half = s + vec3(0.5);
+    vec3 series = coeff0 + coeff1/(s + vec3(1.0));
+    vec3 power_base = (s_plus_half + lanczos_g)/euler;
+    vec3 gamma_s_plus_1 = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_1 * s_inv;
+}
+
+vec2 gamma_impl(vec2 s, vec2 s_inv)
+{
+    //  vec2 version of gamma_impl(vec4, vec4):
+    vec2 lanczos_g = vec2(1.12906830989);
+    vec2 coeff0 = vec2(0.8109119309638332633713423362694399653724431);
+    vec2 coeff1 = vec2(0.4808354605142681877121661197951496120000040);
+    vec2 euler = vec2(2.71828182845904523536028747135266249775724709);
+    vec2 s_plus_half = s + vec2(0.5);
+    vec2 series = coeff0 + coeff1/(s + vec2(1.0));
+    vec2 power_base = (s_plus_half + lanczos_g)/euler;
+    vec2 gamma_s_plus_1 = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_1 * s_inv;
+}
+
+float gamma_impl(float s, float s_inv)
+{
+    //  Scalar version of gamma_impl(vec4, vec4):
+    float lanczos_g = 1.12906830989;
+    float coeff0 = 0.8109119309638332633713423362694399653724431;
+    float coeff1 = 0.4808354605142681877121661197951496120000040;
+    float euler = 2.71828182845904523536028747135266249775724709;
+    float s_plus_half = s + 0.5;
+    float series = coeff0 + coeff1/(s + 1.0);
+    float power_base = (s_plus_half + lanczos_g)/euler;
+    float gamma_s_plus_1 = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_1 * s_inv;
+}
+
+vec4 gamma(vec4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    Return approximate gamma function output with a maximum
+    //              relative error of 0.000463.  See gamma_impl for details.
+    return gamma_impl(s, vec4(1.0)/s);
+}
+
+vec3 gamma(vec3 s)
+{
+    //  vec3 version of gamma(vec4):
+    vec3 s_inv = vec3(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+vec2 gamma(vec2 s)
+{
+    //  vec2 version of gamma(vec4):
+    vec2 s_inv = vec2(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(float s)
+{
+    //  Scalar version of gamma(vec4):
+    float s_inv = 1.0/s;
+    return gamma_impl(s, s_inv);
+}
+
+//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
+
+// Lower incomplete gamma function for small s and z (implementation):
+vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The "rolled up" summation this unrolls is:
+    //      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = sign * z_pow / ((s + 0.0) * factorial);
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  and the result is scaled by z**s.  Unrolled with folded constants:
+    vec4 z2 = z*z;
+    vec4 series = s_inv;                            //  k = 0 term
+    series -= z/(s + vec4(1.0));                    //  k = 1 term
+    series += z2/(2.0*s + vec4(4.0));               //  k = 2 term
+    series -= (z * z2)/(6.0*s + vec4(18.0));        //  k = 3 term
+    //  A k = 4 term would be: series += (z2 * z2)/(24.0*s + vec4(96.0));
+    return pow(z, s) * series;
+}
+
+vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
+{
+    //  vec3 version of ligamma_small_z_impl(vec4, vec4, vec4):
+    vec3 z2 = z*z;
+    vec3 series = s_inv;
+    series -= z/(s + vec3(1.0));
+    series += z2/(2.0*s + vec3(4.0));
+    series -= (z * z2)/(6.0*s + vec3(18.0));
+    return pow(z, s) * series;
+}
+
+vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv)
+{
+    //  vec2 version of ligamma_small_z_impl(vec4, vec4, vec4):
+    vec2 z2 = z*z;
+    vec2 series = s_inv;
+    series -= z/(s + vec2(1.0));
+    series += z2/(2.0*s + vec2(4.0));
+    series -= (z * z2)/(6.0*s + vec2(18.0));
+    return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(float s, float z, float s_inv)
+{
+    //  Scalar version of ligamma_small_z_impl(vec4, vec4, vec4):
+    float z2 = z*z;
+    float series = s_inv;
+    series -= z/(s + 1.0);
+    series += z2/(2.0*s + 4.0);
+    series -= (z * z2)/(6.0*s + 18.0);
+    return pow(z, s) * series;
+}
+
+// Upper incomplete gamma function for small s and large z (implementation):
+vec4 uigamma_large_z_impl(vec4 s, vec4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+    //      denom = vec4('inf');
+    //      vec4 one = vec4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled with folded constants, innermost level first:
+    vec4 prefactor = pow(z, s) * exp(-z);
+    vec4 fraction = vec4(7.0) + z - s;
+    fraction = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/fraction;
+    fraction = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/fraction;
+    fraction = vec4(1.0) + z - s + (s - vec4(1.0))/fraction;
+    return prefactor / fraction;
+}
+
+vec3 uigamma_large_z_impl(vec3 s, vec3 z)
+{
+    //  vec3 version of uigamma_large_z_impl(vec4, vec4):
+    vec3 prefactor = pow(z, s) * exp(-z);
+    vec3 fraction = vec3(7.0) + z - s;
+    fraction = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/fraction;
+    fraction = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/fraction;
+    fraction = vec3(1.0) + z - s + (s - vec3(1.0))/fraction;
+    return prefactor / fraction;
+}
+
+vec2 uigamma_large_z_impl(vec2 s, vec2 z)
+{
+    // vec2 version:
+    vec2 numerator = pow(z, s) * exp(-z);
+    vec2 denom = vec2(7.0) + z - s;
+    denom = vec2(5.0) + z -
s + (3.0*s - vec2(9.0))/denom;
+    denom = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/denom;
+    denom = vec2(1.0) + z - s + (s - vec2(1.0))/denom;
+    return numerator / denom;
+}
+
+float uigamma_large_z_impl(float s, float z)
+{
+    // Float version:
+    float numerator = pow(z, s) * exp(-z);
+    float denom = 7.0 + z - s;
+    denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
+    denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
+    denom = 1.0 + z - s + (s - 1.0)/denom;
+    return numerator / denom;
+}
+
+// Normalized lower incomplete gamma function for small s (implementation):
+vec4 normalized_ligamma_impl(vec4 s, vec4 z,
+    vec4 s_inv, vec4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only
+    //              need to evaluate two branches (not four) based on z.  Each
+    //              branch uses four terms, with a max relative error of
+    //              ~0.00182.  The branch threshold and specifics were adapted
+    //              for fewer terms from Gil/Segura/Temme's paper here:
+    //              http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Evaluate both branches: Real branches test slower even when available.
+    vec4 thresh = vec4(0.775075);
+    bvec4 z_is_large = greaterThan(z , thresh);
+    //  Per-component mask: 1.0 where z > thresh, 0.0 elsewhere.
+    vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0);
+    vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Combine the results from both branches.  The small-z result must be
+    //  weighted by the COMPLEMENT of the mask; weighting both results by the
+    //  same mask yields 0.0 for small z and (large_z + small_z) for large z.
+    return large_z * z_size_check + small_z * (vec4(1.0) - z_size_check);
+}
+
+vec3 normalized_ligamma_impl(vec3 s, vec3 z,
+    vec3 s_inv, vec3 gamma_s_inv)
+{
+    //  vec3 version (see the vec4 overload for the contract):
+    vec3 thresh = vec3(0.775075);
+    bvec3 z_is_large = greaterThan(z , thresh);
+    vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0);
+    vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Select large_z where z > thresh and small_z elsewhere:
+    return large_z * z_size_check + small_z * (vec3(1.0) - z_size_check);
+}
+
+vec2 normalized_ligamma_impl(vec2 s, vec2 z,
+    vec2 s_inv, vec2 gamma_s_inv)
+{
+    //  vec2 version (see the vec4 overload for the contract):
+    vec2 thresh = vec2(0.775075);
+    bvec2 z_is_large = greaterThan(z , thresh);
+    vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0);
+    vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Select large_z where z > thresh and small_z elsewhere:
+    return large_z * z_size_check + small_z * (vec2(1.0) - z_size_check);
+}
+
+float normalized_ligamma_impl(float s, float z,
+    float s_inv, float gamma_s_inv)
+{
+    //  Float version (see the vec4 overload for the contract):
+    float thresh = 0.775075;
+    float z_size_check = 0.0;
+    if (z > thresh) z_size_check = 1.0;
+    float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Select large_z when z > thresh and small_z otherwise:
+    return large_z * z_size_check + small_z * (1.0 - z_size_check);
+}
+
+// Normalized lower incomplete gamma function for small s:
+vec4 normalized_ligamma(vec4 s, vec4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  See normalized_ligamma_impl() for details.
+    vec4 s_inv = vec4(1.0)/s;
+    vec4 gamma_s_inv = vec4(1.0)/gamma_impl(s, s_inv);
+    return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+vec3 normalized_ligamma(vec3 s, vec3 z)
+{
+    //  vec3 version: precompute 1/s and 1/gamma(s), then defer to the impl.
+    vec3 inv_s = vec3(1.0)/s;
+    vec3 inv_gamma_s = vec3(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+vec2 normalized_ligamma(vec2 s, vec2 z)
+{
+    //  vec2 version: precompute 1/s and 1/gamma(s), then defer to the impl.
+    vec2 inv_s = vec2(1.0)/s;
+    vec2 inv_gamma_s = vec2(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float normalized_ligamma(float s, float z)
+{
+    //  Scalar version: precompute 1/s and 1/gamma(s), then defer to the impl.
+    float inv_s = 1.0/s;
+    float inv_gamma_s = 1.0/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+#endif // SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////// HELPERS //////////////////////////////////
+
+vec4 uv2_to_uv4(vec2 tex_uv)
+{
+    //  Pad a vec2 uv offset with zeros so it can be added to vec4
+    //  tex2Dlod-style coords:
+    return vec4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+// Make a length squared helper macro (for usage with static constants):
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Requires:   sigma is the Gaussian standard deviation used by the
+    //              calling blur.
+    //  Returns:    An approximate inverse of the blur's weight sum.
+    //  Original author's notes:
+    //  We can use the Gaussian integral to calculate the asymptotic weight
+    //  for the center pixel.  Since the unnormalized center pixel weight is
+    //  1.0, the normalized weight is the same as the weight sum inverse.
+    //  Given a large enough blur (9+), the asymptotic weight sum is close
+    //  and faster:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  erf(-x) == -erf(x), so this is 0.5 * (2.0 * erf(0.5/(sigma*sqrt(2.0)))).
+    //  However, curve-fitting is even faster.  The fit is also closer than
+    //  the asymptotic results, because it was constructed from 64 blur sizes
+    //  from [3, 131) and 255 equally-spaced sigmas from (0, blurN_std_dev),
+    //  so the results for smaller sigmas are biased toward smaller blurs.
+    //  The max error is 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    //  The erf-based alternative would be:
+    //      static float temp = 0.5/sqrt(2.0);
+    //      return erf(temp/sigma);
+    float double_exp_term = exp(exp(0.348348412457428/
+        (sigma - 0.0860587260734721)));
+    float inverse_sigma_term = 0.399334576340352/sigma;
+    return min(double_exp_term, inverse_sigma_term);
+}
+
+//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS ///////////////////
+
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using a 11-tap
+    //              blur.  It may be mipmapped depending on settings and dxdy.
+    //  Calculate Gaussian blur kernel weights and a normalization factor for
+    //  distances of 0-5, ignoring constant factors (since we're normalizing).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    float w2 = exp(-4.0 * denom_inv);
+    float w3 = exp(-9.0 * denom_inv);
+    float w4 = exp(-16.0 * denom_inv);
+    float w5 = exp(-25.0 * denom_inv);
+    float weight_sum_inv = 1.0 /
+        (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Statically normalize weights, sum weighted samples, and return.  Blurs
+    //  are currently optimized for dynamic weights.
+ vec3 sum = vec3(0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. 
+ // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w01 = w0 * 0.5 + w1; + float w23 = w2 + w3; + float w45 = w4 + w5; + float w01_ratio = w1/w01; + float w23_ratio = w3/w23; + float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + //float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + float w1_2 = w1 + w2; + float w3_4 = w3 + w4; + float w5_6 = w5 + w6; + float w7_8 = w7 + w8; + float w9_10 = w9 + w10; + float w11_12 = w11 + w12; + float w1_2_ratio = w2/w1_2; + float w3_4_ratio = w4/w3_4; + float w5_6_ratio = w6/w5_6; + float w7_8_ratio = w8/w7_8; + float w9_10_ratio = w10/w9_10; + float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + 
(7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, + vec2 dxdy, float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x 
Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + float denom_inv = 0.5/(sigma*sigma); + float w0 = 1.0; + float w1 = exp(-1.0 * denom_inv); + float w2 = exp(-4.0 * denom_inv); + float w3 = exp(-9.0 * denom_inv); + float w4 = exp(-16.0 * denom_inv); + float w5 = exp(-25.0 * denom_inv); + float w6 = exp(-36.0 * denom_inv); + float w7 = exp(-49.0 * denom_inv); + float w8 = exp(-64.0 * denom_inv); + float w9 = exp(-81.0 * denom_inv); + float w10 = exp(-100.0 * denom_inv); + float w11 = exp(-121.0 * denom_inv); + float w12 = exp(-144.0 * denom_inv); + float w13 = exp(-169.0 * denom_inv); + float w14 = exp(-196.0 * denom_inv); + float w15 = exp(-225.0 * denom_inv); + float w16 = exp(-256.0 * denom_inv); + float w17 = exp(-289.0 * denom_inv); + float w18 = exp(-324.0 * denom_inv); + float w19 = exp(-361.0 * denom_inv); + float w20 = exp(-400.0 * denom_inv); + float w21 = exp(-441.0 * denom_inv); + //float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ float w0_1 = w0 * 0.5 + w1; + float w2_3 = w2 + w3; + float w4_5 = w4 + w5; + float w6_7 = w6 + w7; + float w8_9 = w8 + w9; + float w10_11 = w10 + w11; + float w12_13 = w12 + w13; + float w14_15 = w14 + w15; + float w16_17 = w16 + w17; + float w18_19 = w18 + w19; + float w20_21 = w20 + w21; + float w0_1_ratio = w1/w0_1; + float w2_3_ratio = w3/w2_3; + float w4_5_ratio = w5/w4_5; + float w6_7_ratio = w7/w6_7; + float w8_9_ratio = w9/w8_9; + float w10_11_ratio = w11/w10_11; + float w12_13_ratio = w13/w12_13; + float w14_15_ratio = w15/w14_15; + float w16_17_ratio = w17/w16_17; + float w18_19_ratio = w19/w18_19; + float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + vec3 sum = vec3(0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * 
vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float sigma)
{
    //  Requires:   Same as tex2Dblur11()
    //  Parameters: tex     texture to sample (read through tex2D_linearize)
    //              tex_uv  uv coordinate of the blur center
    //              dxdy    uv offset between adjacent texels along the blur axis
    //              sigma   Gaussian standard deviation, in texels
    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
    //              taps.  It may be mipmapped depending on settings and dxdy.
    //  First get the unnormalized Gaussian texel weights (center texel = 1.0):
    float denom_inv = 0.5/(sigma*sigma);
    float w0 = 1.0;
    float w1 = exp(-1.0 * denom_inv);
    //  Calculate the combined weight and linear sample ratio of a texel pair.
    //  The center texel (with weight w0) is used twice, so halve its weight.
    float w01 = w0 * 0.5 + w1;
    float w01_ratio = w1/w01;
    //  Both taps carry the same weight w01, and 2.0 * w01 == w0 + 2.0 * w1 is
    //  the whole kernel sum, so each normalized tap weight is exactly 0.5.
    //  No separate normalization factor is needed; just average the taps:
    return 0.5 * (
        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
}
vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float sigma)
{
    //  Requires:   Same as tex2Dblur11()
    //  Returns:    A 1D 7-texel Gaussian blur of tex around tex_uv, gathered
    //              with 4 linear taps spaced along dxdy.  It may be mipmapped
    //              depending on settings and dxdy.
    //  Unnormalized Gaussian weights for texels 0..3 away from the center:
    float inv_two_variance = 0.5/(sigma*sigma);
    float gauss0 = 1.0;
    float gauss1 = exp(-1.0 * inv_two_variance);
    float gauss2 = exp(-4.0 * inv_two_variance);
    float gauss3 = exp(-9.0 * inv_two_variance);
    float normalizer = 1.0 / (gauss0 + 2.0 * (gauss1 + gauss2 + gauss3));
    //  Each linear tap reads one texel pair.  The center texel feeds both
    //  inner taps, so only half of its weight goes to each of them.
    float inner_pair = gauss0 * 0.5 + gauss1;
    float outer_pair = gauss2 + gauss3;
    //  Place each tap between its two texels in proportion to their weights:
    vec2 inner_offset = (gauss1/inner_pair) * dxdy;
    vec2 outer_offset = (2.0 + gauss3/outer_pair) * dxdy;
    //  Accumulate the taps from one end of the kernel to the other:
    vec3 blurred = vec3(0.0);
    blurred += outer_pair * tex2D_linearize(tex, tex_uv - outer_offset).rgb;
    blurred += inner_pair * tex2D_linearize(tex, tex_uv - inner_offset).rgb;
    blurred += inner_pair * tex2D_linearize(tex, tex_uv + inner_offset).rgb;
    blurred += outer_pair * tex2D_linearize(tex, tex_uv + outer_offset).rgb;
    return blurred * normalizer;
}
vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float sigma)
{
    //  Requires:   Same as tex2Dblur11()
    //  Returns:    A 1D 9-texel Gaussian blur of tex around tex_uv, gathered
    //              with 1 center tap and 4 linear taps spaced along dxdy.
    //              It may be mipmapped depending on settings and dxdy.
    //  Unnormalized Gaussian weights for texels 0..4 away from the center:
    float inv_two_variance = 0.5/(sigma*sigma);
    float gauss0 = 1.0;
    float gauss1 = exp(-1.0 * inv_two_variance);
    float gauss2 = exp(-4.0 * inv_two_variance);
    float gauss3 = exp(-9.0 * inv_two_variance);
    float gauss4 = exp(-16.0 * inv_two_variance);
    float normalizer =
        1.0 / (gauss0 + 2.0 * (gauss1 + gauss2 + gauss3 + gauss4));
    //  Each linear tap reads one texel pair: texels [1, 2] and texels [3, 4].
    float near_pair = gauss1 + gauss2;
    float far_pair = gauss3 + gauss4;
    //  Place each tap between its two texels in proportion to their weights:
    vec2 near_offset = (1.0 + gauss2/near_pair) * dxdy;
    vec2 far_offset = (3.0 + gauss4/far_pair) * dxdy;
    //  Accumulate the taps from one end of the kernel to the other:
    vec3 blurred = vec3(0.0);
    blurred += far_pair * tex2D_linearize(tex, tex_uv - far_offset).rgb;
    blurred += near_pair * tex2D_linearize(tex, tex_uv - near_offset).rgb;
    blurred += gauss0 * tex2D_linearize(tex, tex_uv).rgb;
    blurred += near_pair * tex2D_linearize(tex, tex_uv + near_offset).rgb;
    blurred += far_pair * tex2D_linearize(tex, tex_uv + far_offset).rgb;
    return blurred * normalizer;
}
The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. + + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). 
+ float denom_inv = 0.5/(sigma*sigma); + float w1off = exp(-1.0 * denom_inv); + float w2off = exp(-4.0 * denom_inv); + float w3off = exp(-9.0 * denom_inv); + float w4off = exp(-16.0 * denom_inv); + float texel1to2ratio = w2off/(w1off + w2off); + float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); + vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0); + vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio); + vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio); + vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ float w1R1 = w1off; + float w1R2 = w2off; + float w2R1 = w3off; + float w2R2 = w4off; + float w3d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); + float w3d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); + float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv); + float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); + float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv); + float w6d1 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv); + float w6d4 = exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + float w0 = 1.0; + float w1 = w1R1 + w1R2; + float w2 = w2R1 + w2R2; + float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + float w5 = w4; + float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + vec2 mirror_x = vec2(-1.0, 1.0); + vec2 mirror_y = vec2(1.0, -1.0); + vec2 mirror_xy = vec2(-1.0, -1.0); + vec2 dxdy_mirror_x = dxdy * mirror_x; + vec2 dxdy_mirror_y = dxdy * mirror_y; + vec2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * 
sample2R_texel_offset).rgb; + vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float sigma)
{
    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
    //  Requires:   Same as tex2Dblur9()
    //  Parameters: tex     texture to sample (read through tex2D_linearize)
    //              tex_uv  uv coordinate of the blur center
    //              dxdy    per-axis uv offset between adjacent texels
    //              sigma   Gaussian standard deviation, in texels
    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
    //              4x4 carefully selected bilinear samples.
    //  Description:
    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
    //  blur mixes concepts from both.  The sample layout is as follows:
    //      4a 3a 3b 4b
    //      2a 1a 1b 2b
    //      2c 1c 1d 2d
    //      4c 3c 3d 4d
    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
    //  the center texel):
    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4

    //  COMPUTE TEXTURE COORDS:
    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
    //  The center texel is split between the two samples on either side of
    //  it along each axis, hence the halved w0off in texel0to1ratio.
    float denom_inv = 0.5/(sigma*sigma);
    float w0off = 1.0;
    float w1off = exp(-1.0 * denom_inv);
    float w2off = exp(-4.0 * denom_inv);
    float w3off = exp(-9.0 * denom_inv);
    float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
    float texel2to3ratio = w3off/(w2off + w3off);
    //  Statically compute texel offsets from the fragment center to each
    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
    vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
    vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio);
    vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio);
    vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio);

    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
    //  Read underscores as "and."  Each weight depends only on the texel's
    //  distance from the center, so one value serves both texels in its name.
    //  NOTE(review): LENGTH_SQ is defined elsewhere in the file; presumably it
    //  is the squared vector length -- confirm against its definition.
    float w1abcd = 1.0;
    float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
    float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv);
    float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv);
    float w1d4 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
    float w2d3_3d2 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
    float w2d4_3d4 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
    float w4d1 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
    float w4d2_4d3 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
    float w4d4 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
    //  Statically add texel weights in each sample to get sample weights.
    //  Split weights for shared texels between samples sharing them:
    //  the center texel is shared by all four sample1's (factor 0.25), and a
    //  texel on the center row or column is shared by two samples (factor
    //  0.5).  w1bd2_1cd3 stands for two such half-weighted texels in sample
    //  1d, so it enters w1 with a net factor of 1.0.
    float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
    float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
    float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
    //  Get the weight sum inverse (normalization factor).  Each quadrant
    //  holds one sample1, one sample2, one sample3 (same weight as sample2),
    //  and one sample4:
    float weight_sum_inv =
        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));

    //  LOAD TEXTURE SAMPLES:
    //  Load all 16 samples using symmetry.  The bottom-right ("d") offsets are
    //  mirrored into the other quadrants by flipping the sign of dxdy per axis:
    vec2 mirror_x = vec2(-1.0, 1.0);
    vec2 mirror_y = vec2(1.0, -1.0);
    vec2 mirror_xy = vec2(-1.0, -1.0);
    vec2 dxdy_mirror_x = dxdy * mirror_x;
    vec2 dxdy_mirror_y = dxdy * mirror_y;
    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
    vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
    vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
    vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
    vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
    vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
    vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
    vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
    vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;

    //  SUM WEIGHTED SAMPLES:
    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
    vec3 sum = vec3(0.0);
    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
    return sum * weight_sum_inv;
}
vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float sigma)
{
    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
    //  Requires:   Same as tex2Dblur9()
    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup built from
    //              2x2 bilinear samples, one per quadrant around tex_uv.
    //  Description:
    //  Every sample covers a 2x2 texel block that includes the center texel:
    //      0a3  0ab2 0b3
    //      0ac1 0*0  0bd1
    //      0c3  0cd2 0d3
    //  Samples 0a/0b and 0c/0d share a column of texels, samples 0a/0c and
    //  0b/0d share a row, and all four share the center.

    //  COMPUTE TEXTURE COORDS:
    //  Along each axis the center texel is split between the two samples on
    //  either side of it, so it contributes half its weight when positioning
    //  a sample between the center and its neighbor.
    float inv_two_variance = 0.5/(sigma*sigma);
    float center_weight = 1.0;
    float neighbor_weight = exp(-1.0 * inv_two_variance);
    float tap_shift = neighbor_weight/(center_weight * 0.5 + neighbor_weight);
    //  Offset (in texels) of the bottom-right sample from the fragment center:
    vec2 tap_offset = vec2(tap_shift, tap_shift);

    //  LOAD TEXTURE SAMPLES:
    //  Mirror the bottom-right offset into the other three quadrants by
    //  flipping the sign of the texel step per axis:
    vec2 step_top_left = dxdy * vec2(-1.0, -1.0);
    vec2 step_top_right = dxdy * vec2(1.0, -1.0);
    vec2 step_bottom_left = dxdy * vec2(-1.0, 1.0);
    vec3 top_left = tex2D_linearize(tex, tex_uv + step_top_left * tap_offset).rgb;
    vec3 top_right = tex2D_linearize(tex, tex_uv + step_top_right * tap_offset).rgb;
    vec3 bottom_left = tex2D_linearize(tex, tex_uv + step_bottom_left * tap_offset).rgb;
    vec3 bottom_right = tex2D_linearize(tex, tex_uv + dxdy * tap_offset).rgb;

    //  SUM WEIGHTED SAMPLES:
    //  All four samples have identical weights, so the result is their mean:
    return 0.25 * (top_left + top_right + bottom_left + bottom_right);
}
tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur17fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur17fast(texture, tex_uv, dxdy, blur17_std_dev); +} + +vec3 tex2Dblur25fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur25fast(texture, tex_uv, dxdy, blur25_std_dev); +} + +vec3 tex2Dblur43fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur43fast(texture, tex_uv, dxdy, blur43_std_dev); +} +vec3 tex2Dblur31fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur31fast(texture, tex_uv, dxdy, blur31_std_dev); +} + +vec3 tex2Dblur3fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3fast(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur3x3(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3x3(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5fast(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur5resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5resize(texture, tex_uv, dxdy, blur5_std_dev); +} +vec3 tex2Dblur3resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur3resize(texture, tex_uv, dxdy, blur3_std_dev); +} + +vec3 tex2Dblur5x5(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur5x5(texture, tex_uv, dxdy, blur5_std_dev); +} + +vec3 tex2Dblur7resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7resize(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7fast(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7fast(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur7x7(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur7x7(texture, tex_uv, dxdy, blur7_std_dev); +} + +vec3 tex2Dblur9resize(sampler2D texture, vec2 tex_uv, + vec2 dxdy) +{ + return tex2Dblur9resize(texture, tex_uv, dxdy, blur9_std_dev); +} + +vec3 tex2Dblur9x9(sampler2D 
void main() {
    //  Vertex stage of the vertical blur pass: forward the vertex position
    //  and texture coordinate, and publish the per-pixel blur step.
    gl_Position = position;
    vTexCoord = texCoord;
    //  The blur step is the uv distance between *output* pixels.  Blurs are
    //  not generic Gaussian resizers, and correct blurs require:
    //  1.) OutputSize == InputSize * 2^m, where m is an integer <= 0.
    //  2.) mipmap_inputN = "true" for this pass in the preset if m != 0
    //  3.) filter_linearN = "true" except for 1x scale nearest neighbor blurs
    //  A Gaussian resizer would upsize using the distance between input
    //  texels instead, but blurring consistently at the destination size
    //  avoids the terrible artifacts that would otherwise come from combining
    //  statically calculated weights with bilinear sample exploitation.
    vec2 output_pixel_step = (InputSize/OutputSize)/TextureSize;
    //  This pass blurs vertically only, so the horizontal component is zero:
    blur_dxdy = vec2(0.0, output_pixel_step.y);
}
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +in Vertex { + vec2 vTexCoord; + vec2 scanline_tex_uv; + vec2 blur3x3_tex_uv; + float bloom_sigma_runtime; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define 
aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define MASKED_SCANLINEStexture source[0] +#define MASKED_SCANLINEStexture_size sourceSize[0].xy +#define MASKED_SCANLINESvideo_size sourceSize[0].xy +#define BLOOM_APPROXtexture source[5] +#define BLOOM_APPROXtexture_size sourceSize[5].xy +#define BLOOM_APPROXvideo_size sourceSize[5].xy + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS 
MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. 
Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now).
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): this comment used to say the value was raised to 1.0, but the code uses 0.5 -- confirm intent + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel?
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now).
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines.
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (a higher value such as 1.0 dims less if 0.5 looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (a higher value such as 1.0 dims less if 0.5 looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + // NOTE(review): the default x offset below is negative (-1.0/3.0), so the + // stated range presumably describes the offset magnitude; confirm. + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option only for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + // Pad by the offset's magnitude: the default red offset is negative (blue + // mirrors it), and a signed value would shrink the border instead: + static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x); + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here.
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +// Return the red subpixel antialiasing offset: the runtime values only when +// both RUNTIME_ANTIALIAS_WEIGHTS and RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS are +// #defined, and aa_subpixel_r_offset_static otherwise. +inline float2 get_aa_subpixel_r_offset()
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +// get_mask_amplify() returns the reciprocal of the average color of the mask +// selected by mask_type (< 0.5: grille, < 1.5: slot, otherwise: shadow). +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +// get_mask_sample_mode() returns mask_sample_mode_desired when +// RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT is #defined and +// mask_sample_mode_static otherwise. Without PHOSPHOR_MASK_MANUALLY_RESIZE, +// the result is clamped to [1, 2] (mode 0 requires manual resizing). +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the
Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO in each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not.
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader.
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details.
This greatly reduces the amount of code in each shader
+// pass that depends on the pass number in the .cgp preset or whether sRGB
+// FBO's are being used: You can trivially change the gamma behavior of your
+// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
+// code in your first, Nth, and last passes, you can even put it all in another
+// header file and #include it from skeleton .cg files that #define the
+// appropriate pass-specific settings.
+//
+// Rationale for Using Three Macros:
+// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+// a lower maintenance burden on each pass. At first glance it seems we could
+// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it
+// breaks down for more complex scenarios like CRT simulation, where the pass
+// number determines the gamma encoding of the input and output.
+
+
+/////////////////////////////// BASE CONSTANTS ///////////////////////////////
+
+// Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    // Standard encoding gammas:
+    static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? Returned by get_intermediate_gamma() and, when nothing is simulated, by get_input_gamma()/get_output_gamma() (unless OVERRIDE_FINAL_GAMMA).
+    static const float pal_gamma = 2.8; // Never actually 2.8 in practice
+    // Typical device decoding gammas (only use for emulating devices):
+    // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    // gammas: The standards purposely undercorrected for an analog CRT's
+    // assumed 2.5 reference display gamma to maintain contrast in assumed
+    // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    // These unstated assumptions about display gamma and perceptual rendering
+    // intent caused a lot of confusion, and more modern CRT's seemed to target
+    // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
+    // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    // displays designed to view sRGB in bright environments. (Standards are
+    // also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55); returned by get_crt_gamma() unless OVERRIDE_DEVICE_GAMMA
+    static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5; // To match CRT
+    static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2; // Approximates sRGB; returned by get_lcd_gamma() unless OVERRIDE_DEVICE_GAMMA
+#endif // OVERRIDE_STANDARD_GAMMA
+
+// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+// but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false; // false: the encode/decode helpers pass color.a through; true: they return alpha = 1.0
+#endif
+
+
+/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
+
+// gamma-management.h should be compatible with overriding gamma values with
+// runtime user parameters, but we can only define other global constants in
+// terms of static constants, not uniform user parameters. To get around this
+// limitation, we need to define derived constants using functions.
+
+// Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    // The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#else
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#endif // OVERRIDE_DEVICE_GAMMA
+
+// Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    // The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#else
+    // If we gamma-correct every pass, always use ntsc_gamma between passes to
+    // ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else // Don't simulate anything:
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif // SIMULATE_GBA_ON_CRT
+    #endif // SIMULATE_LCD_ON_CRT
+    #endif // SIMULATE_GBA_ON_LCD
+    #endif // SIMULATE_CRT_ON_LCD
+#endif // OVERRIDE_FINAL_GAMMA
+
+// Set decoding/encoding gammas for the current pass. Use static constants for
+// linearize_input and gamma_encode_output, because they aren't derived, and
+// they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif
+#else
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif
+#endif
+
+// Users might want to know if bilinear filtering will be gamma-correct
+// (true exactly when this pass does not linearize its input in the shader):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
+// encode_output(): when gamma_encode_output is set, returns color.rgb raised to 1.0/get_pass_output_gamma(); otherwise returns color unchanged. Alpha becomes 1.0 only in the encoding path with assume_opaque_alpha set.
+inline float4 encode_output(const float4 color)
+{
+    if(gamma_encode_output)
+    {
+        if(assume_opaque_alpha)
+        {
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0);
+        }
+        else
+        {
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a);
+        }
+    }
+    else
+    {
+        return color;
+    }
+}
+// decode_input(): when linearize_input is set, returns color.rgb raised to get_pass_input_gamma(); otherwise returns color unchanged. Alpha is treated the same way as in encode_output().
+inline float4 decode_input(const float4 color)
+{
+    if(linearize_input)
+    {
+        if(assume_opaque_alpha)
+        {
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0);
+        }
+        else
+        {
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a);
+        }
+    }
+    else
+    {
+        return color;
+    }
+}
+// decode_gamma_input(): always raises color.rgb to the caller-supplied per-channel gamma (linearize_input is not consulted); alpha is 1.0 if assume_opaque_alpha, else color.a.
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    if(assume_opaque_alpha)
+    {
+        return float4(pow(color.rgb, gamma), 1.0);
+    }
+    else
+    {
+        return float4(pow(color.rgb, gamma), color.a);
+    }
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+/////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
+
+// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a wide array of linearizing texture lookup wrapper functions. The
+// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+// lookups are provided for completeness in case that changes someday. Nobody
+// is likely to use the *fetch and *proj functions, but they're included just
+// in case. The only tex*D texture sampling functions omitted are:
+// - tex*Dcmpbias
+// - tex*Dcmplod
+// - tex*DARRAY*
+// - tex*DMS*
+// - Variants returning integers
+// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{ return decode_input(tex1Dlod(tex, tex_coords)); }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
+
+// tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{ return decode_input(tex1Dproj(tex, tex_coords)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
+*/
+// tex2D: sample with COMPAT_TEXTURE (only .xy of a float3 coord is used), then decode_input(). NOTE(review): the two texel_off overloads forward texel_off to textureLod() as its third (LOD) argument, not as a texel offset - confirm this is intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords, texel_off)); }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
+
+// tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{ return decode_input(tex2Dbias(tex, tex_coords)); }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
+
+// tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{ return decode_input(tex2Dfetch(tex, tex_coords)); }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
+
+// tex2Dlod: the first overload always samples at LOD 0.0 and ignores tex_coords.zw. NOTE(review): the texel_off overload passes texel_off as the LOD argument instead - confirm this is intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
+/*
+// tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{ return decode_input(tex2Dproj(tex, tex_coords)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
+*/
+/*
+// tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
+/*
+// tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
+
+// tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
+*/ // NOTE(review): this appears to be the first comment terminator after the block comment opened before "// tex3D:" (the "/////////*" line after tex3Dproj_linearize does not close it), so both tex2D_linearize_gamma overloads above are commented out - confirm that is intended.
+// tex2Dlod: the first overload samples at LOD 0.0 (tex_coords.zw ignored) and decodes with the given gamma. NOTE(review): the texel_off overload passes texel_off as the textureLod LOD argument - confirm this is intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); }
+
+
+#endif // GAMMA_MANAGEMENT_H
+
+//////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
+
+// crt-royale: A full-featured CRT shader, with cheese.
+// Copyright (C) 2014 TroggleMonkey
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+////////////////////////////////// INCLUDES //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+///////////////////////////// CODEPATH SELECTION /////////////////////////////
+
+// Choose a looping strategy based on what's allowed:
+// Dynamic loops not allowed: Use a flat static loop (USE_SINGLE_STATIC_LOOP).
+// Dynamic loops accommodated: Coarsely branch around static loops (BREAK_LOOPS_INTO_PIECES).
+// Dynamic loops assumed allowed: Use a flat dynamic loop (neither macro defined).
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif // No else needed: Dynamic loops assumed.
+
+
+////////////////////////////////// CONSTANTS /////////////////////////////////
+
+// The larger the resized tile, the fewer samples we'll need for downsizing.
+// See if we can get a static min tile size > mask_min_allowed_tile_size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size =
+        mask_min_allowed_tile_size;
+// Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+// Vectorized loops sample in multiples of 4. Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    // Requires: The following global constants must be defined:
+    // 1.) mask_sinc_lobes
+    // 2.) max_sinc_resize_samples_m4
+    // Returns: The minimum number of texture samples for a correct downsize
+    // at magnification_scale.
+    // We're downsizing, so the filter is sized across 2*lobes output pixels
+    // (not 2*lobes input texels). This impacts distance measurements and the
+    // minimum number of input samples needed.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else // ifdef BREAK_LOOPS_INTO_PIECES
+        // Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    // Requires: 1.) dr == du == 1.0/texture_size.x or
+    // dr == dv == 1.0/texture_size.y
+    // (whichever direction we're resampling in).
+    // It's a scalar to save register space.
+    // 2.) input_tiles_per_texture_r is the number of input tiles
+    // that can fit in the input texture in the direction we're
+    // resampling this pass.
+    // 3.) vertical indicates whether we're resampling vertically
+    // this pass (or horizontally).
+    // Returns: Pack and return the first sample's tile_uv coord in [0, 1]
+    // and its texel distance from the destination pixel, in the
+    // resized dimension only.
+    // We'll start with the topmost or leftmost sample and work down or right,
+    // so get the first sample location and distance. Modify both dimensions
+    // as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    // (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    // Project wrapped coordinates to the [0, 1] range. We'll do this with all
+    // samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    // Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    // Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    // One [slow] workaround is to select the lowest mip level (explicit LOD 0.0; textureLod needs the LOD as its third argument):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); // NOTE(review): tex2Dbias is a Cg intrinsic; confirm a GLSL-compatible definition exists before enabling this path.
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+////////////////////////////// LOOP BODY MACROS //////////////////////////////
+
+// Using inline functions can exceed the temporary register limit, so we're
+// stuck with #define macros (I'm TRULY sorry). They're declared here instead
+// of above to be closer to the actual invocation sites. Steps:
+// 1.) Get the exact texel location.
+// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+// 3.) Get the distance from the current pixel and sinc weight:
+// sinc(dist) = sin(pi * dist)/(pi * dist)
+// We can also use the slower/smoother Lanczos instead:
+// L(x) = sinc(dist) * sinc(dist / lobes)
+// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+// in pixel_color (we'll normalize outside the loop at the end).
+// We vectorize the loop to help reduce the Lanczos window's cost.
+
+    // The r coord is the coord in the dimension we're resizing along (u or v),
+    // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
+    // for four new texel samples.
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. 
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonally opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture. 
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. 
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said 0.5 was unnecessarily dark and that the value was set to 1.0, but the constant is 0.5 -- confirm the intended value + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, now just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (currently in effect; raise toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0) * last_factorial) + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.)
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
static const float crt_reference_gamma_high = 2.5;  //  Upper CRT reference; in (2.35, 2.55)
static const float crt_reference_gamma_low = 2.35;  //  Lower CRT reference; in (2.35, 2.55)
static const float lcd_reference_gamma = 2.5;       //  Set to match the CRT reference
static const float crt_office_gamma = 2.2;          //  CRT circuitry-adjusted for NTSC
static const float lcd_office_gamma = 2.2;          //  Approximately sRGB
#endif  //  OVERRIDE_STANDARD_GAMMA

//  Alpha handling: treating alpha as 1.0 can shield users from some bugs, but
//  only when they know it is happening, so the default leaves alpha untouched.
#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
static const bool assume_opaque_alpha = false;
#endif


///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////

//  gamma-management.h is meant to stay compatible with gamma values supplied
//  as runtime user parameters.  Global constants can only be derived from
//  static constants (never from uniform user parameters), so every derived
//  value below is exposed through an accessor function instead.

//  Device gammas.  Defaults are used unless the user overrides them:
#ifndef OVERRIDE_DEVICE_GAMMA
inline float get_crt_gamma()
{
    return crt_reference_gamma_high;
}

inline float get_gba_gamma()
{
    //  Game Boy Advance; in (3.0, 4.0)
    return 3.5;
}

inline float get_lcd_gamma()
{
    return lcd_office_gamma;
}
#else
//  OVERRIDE_DEVICE_GAMMA: the user promises that crt_gamma, gba_gamma and
//  lcd_gamma are already defined globally.
inline float get_crt_gamma()
{
    return crt_gamma;
}

inline float get_gba_gamma()
{
    return gba_gamma;
}

inline float get_lcd_gamma()
{
    return lcd_gamma;
}
#endif  //  OVERRIDE_DEVICE_GAMMA

//  Decoding/encoding gammas for the first/last passes.  Overridable as well:
#if defined(OVERRIDE_FINAL_GAMMA)
//  The user promises that intermediate_gamma, input_gamma and output_gamma
//  are already defined globally.
inline float get_intermediate_gamma()
{
    return intermediate_gamma;
}

inline float get_input_gamma()
{
    return input_gamma;
}

inline float get_output_gamma()
{
    return output_gamma;
}
#else
//  When every pass is gamma-corrected, ntsc_gamma is always used between
//  passes, so the middle passes never need to know what is being simulated.
inline float get_intermediate_gamma()
{
    return ntsc_gamma;
}

//  At most one simulation mode applies; earlier branches take precedence.
    #if defined(SIMULATE_CRT_ON_LCD)
inline float get_input_gamma()
{
    return get_crt_gamma();
}

inline float get_output_gamma()
{
    return get_lcd_gamma();
}
    #elif defined(SIMULATE_GBA_ON_LCD)
inline float get_input_gamma()
{
    return get_gba_gamma();
}

inline float get_output_gamma()
{
    return get_lcd_gamma();
}
    #elif defined(SIMULATE_LCD_ON_CRT)
inline float get_input_gamma()
{
    return get_lcd_gamma();
}

inline float get_output_gamma()
{
    return get_crt_gamma();
}
    #elif defined(SIMULATE_GBA_ON_CRT)
inline float get_input_gamma()
{
    return get_gba_gamma();
}

inline float get_output_gamma()
{
    return get_crt_gamma();
}
    #else
//  Nothing is being simulated:
inline float get_input_gamma()
{
    return ntsc_gamma;
}

inline float get_output_gamma()
{
    return ntsc_gamma;
}
    #endif  //  SIMULATE_* selection
#endif  //  OVERRIDE_FINAL_GAMMA

//  Next come the decoding/encoding gammas for the current pass.
//  linearize_input and gamma_encode_output are static constants because they
//  aren't derived, and that lets the compiler do dead-code elimination.
#ifdef GAMMA_ENCODE_EVERY_FBO
    //  Every pass decodes its input and encodes its output.  The first pass
    //  reads with the input gamma and the last pass writes with the output
    //  gamma; every other read/write uses the intermediate gamma.
    static const bool linearize_input = true;
    static const bool gamma_encode_output = true;
    #ifdef FIRST_PASS
        inline float get_pass_input_gamma()
        {
            return get_input_gamma();
        }
    #else
        inline float get_pass_input_gamma()
        {
            return get_intermediate_gamma();
        }
    #endif
    #ifdef LAST_PASS
        inline float get_pass_output_gamma()
        {
            return get_output_gamma();
        }
    #else
        inline float get_pass_output_gamma()
        {
            return get_intermediate_gamma();
        }
    #endif
#else
    //  Without GAMMA_ENCODE_EVERY_FBO only the first pass decodes and only the
    //  last pass encodes; in every other case the pass gamma is 1.0.
    #ifdef FIRST_PASS
        static const bool linearize_input = true;
        inline float get_pass_input_gamma()
        {
            return get_input_gamma();
        }
    #else
        static const bool linearize_input = false;
        inline float get_pass_input_gamma()
        {
            return 1.0;
        }
    #endif
    #ifdef LAST_PASS
        static const bool gamma_encode_output = true;
        inline float get_pass_output_gamma()
        {
            return get_output_gamma();
        }
    #else
        static const bool gamma_encode_output = false;
        inline float get_pass_output_gamma()
        {
            return 1.0;
        }
    #endif
#endif  //  GAMMA_ENCODE_EVERY_FBO

//  Lets users branch statically on whether bilinear filtering will be
//  gamma-correct, which is the case when this pass does not linearize input:
static const bool gamma_aware_bilinear = !linearize_input;


//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////

//  Gamma-encode a pass's output color.
//  Returns color unchanged when gamma_encode_output is false.  Otherwise rgb
//  is raised to 1.0/get_pass_output_gamma(); alpha is passed through, or
//  forced to 1.0 when assume_opaque_alpha is set.
inline float4 encode_output(const float4 color)
{
    if(!gamma_encode_output)
    {
        return color;
    }
    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
    return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), out_alpha);
}

//  Linearize a pass's input color.
//  Returns color unchanged when linearize_input is false.  Otherwise rgb is
//  raised to get_pass_input_gamma(); alpha is passed through, or forced to
//  1.0 when assume_opaque_alpha is set.
inline float4 decode_input(const float4 color)
{
    if(!linearize_input)
    {
        return color;
    }
    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
    return float4(pow(color.rgb, float3(get_pass_input_gamma())), out_alpha);
}

//  Linearize a color with a caller-supplied per-channel gamma.  This always
//  decodes, regardless of linearize_input; alpha follows assume_opaque_alpha
//  exactly as in decode_input().
inline float4 decode_gamma_input(const float4 color, const float3 gamma)
{
    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
    return float4(pow(color.rgb, gamma), out_alpha);
}

//  TODO/FIXME: Replacing the lookup wrappers below with the following macro
//  was found to fix the blurs being offset, for reasons unknown at the time:
//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
//  EDIT: the cause is the 'const' in front of the coords.

///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////

//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
//  A wide array of linearizing texture lookup wrappers follows.  The Cg shader
//  spec Retroarch uses only allows 2D textures, but 1D and 3D lookups are
//  provided for completeness in case that ever changes.  The *fetch and *proj
//  functions are unlikely to be used and are included just in case.  The only
//  tex*D texture sampling functions omitted are:
//      - tex*Dcmpbias
//      - tex*Dcmplod
//      - tex*DARRAY*
//      - tex*DMS*
//      - Variants returning integers
//  Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian_integral_contrib() for detailed comments!
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB (pow exponents are float3 so the calls type-check in GLSL). + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling.
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +/////////////////////////////// END VERTEX-INCLUDES ///////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +//#include "bloom-functions.h" + +//////////////////////////// BEGIN BLOOM-FUNCTIONS /////////////////////////// + +#ifndef BLOOM_FUNCTIONS_H +#define BLOOM_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These utility functions and constants help several passes determine the +// size and center texel weight of the phosphor bloom in a uniform manner. + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// We need to calculate the correct blur sigma using some .cgp constants: +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants.
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities.
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) 
filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// output_size < video_size. +// 4.) output_size == video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (video_size/output_size)/texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(video_size/output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. 
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) 
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) 
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. 
// This works for simple use cases where input_gamma == output_gamma, but it
// breaks down for more complex scenarios like CRT simulation, where the pass
// number determines the gamma encoding of the input and output.


/////////////////////////////// BASE CONSTANTS ///////////////////////////////

// Standard gamma values.  Defining OVERRIDE_STANDARD_GAMMA skips every
// declaration in this group so the user can supply replacements:
#ifndef OVERRIDE_STANDARD_GAMMA
    // Encoding gammas of the video standards:
    static const float ntsc_gamma = 2.2;    // Best to use NTSC for PAL too?
    static const float pal_gamma = 2.8;     // Never actually 2.8 in practice
    // Typical device decoding gammas (only use for emulating devices):
    // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
    // gammas: The standards purposely undercorrected for an analog CRT's
    // assumed 2.5 reference display gamma to maintain contrast in assumed
    // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
    // These unstated assumptions about display gamma and perceptual rendering
    // intent caused a lot of confusion, and more modern CRT's seemed to target
    // NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
    // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
    // displays designed to view sRGB in bright environments.  (Standards are
    // also in flux again with BT.1886, but it's underspecified for displays.)
    static const float crt_reference_gamma_high = 2.5;  // In (2.35, 2.55)
    static const float crt_reference_gamma_low = 2.35;  // In (2.35, 2.55)
    static const float lcd_reference_gamma = 2.5;       // To match CRT
    static const float crt_office_gamma = 2.2;  // Circuitry-adjusted for NTSC
    static const float lcd_office_gamma = 2.2;  // Approximates sRGB
#endif  // OVERRIDE_STANDARD_GAMMA

// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
// but only if they're aware of it.
#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
    static const bool assume_opaque_alpha = false;
#endif


/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////

// gamma-management.h should be compatible with overriding gamma values with
// runtime user parameters, but we can only define other global constants in
// terms of static constants, not uniform user parameters.  To get around this
// limitation, we need to define derived constants using functions.

// Device gammas.  Without OVERRIDE_DEVICE_GAMMA the defaults below are used;
// with it, the user promises to globally define crt_gamma, gba_gamma and
// lcd_gamma:
#ifndef OVERRIDE_DEVICE_GAMMA
    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
    inline float get_gba_gamma()    {   return 3.5;   } // Game Boy Advance; in (3.0, 4.0)
    inline float get_lcd_gamma()    {   return lcd_office_gamma;    }
#else
    inline float get_crt_gamma()    {   return crt_gamma;   }
    inline float get_gba_gamma()    {   return gba_gamma;   }
    inline float get_lcd_gamma()    {   return lcd_gamma;   }
#endif  // OVERRIDE_DEVICE_GAMMA

// Decoding/encoding gammas for the first/last passes.  With
// OVERRIDE_FINAL_GAMMA the user promises to globally define
// intermediate_gamma, input_gamma and output_gamma:
#ifndef OVERRIDE_FINAL_GAMMA
    // If we gamma-correct every pass, always use ntsc_gamma between passes to
    // ensure middle passes don't need to care if anything is being simulated:
    inline float get_intermediate_gamma()   {   return ntsc_gamma;  }
    // The first SIMULATE_* macro that is defined (in this order) wins:
    #if defined(SIMULATE_CRT_ON_LCD)
        inline float get_input_gamma()      {   return get_crt_gamma();     }
        inline float get_output_gamma()     {   return get_lcd_gamma();     }
    #elif defined(SIMULATE_GBA_ON_LCD)
        inline float get_input_gamma()      {   return get_gba_gamma();     }
        inline float get_output_gamma()     {   return get_lcd_gamma();     }
    #elif defined(SIMULATE_LCD_ON_CRT)
        inline float get_input_gamma()      {   return get_lcd_gamma();     }
        inline float get_output_gamma()     {   return get_crt_gamma();     }
    #elif defined(SIMULATE_GBA_ON_CRT)
        inline float get_input_gamma()      {   return get_gba_gamma();     }
        inline float get_output_gamma()     {   return get_crt_gamma();     }
    #else   // Don't simulate anything:
        inline float get_input_gamma()      {   return ntsc_gamma;          }
        inline float get_output_gamma()     {   return ntsc_gamma;          }
    #endif  // SIMULATE_*
#else
    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
    inline float get_input_gamma()          {   return input_gamma;         }
    inline float get_output_gamma()         {   return output_gamma;        }
#endif  // OVERRIDE_FINAL_GAMMA

// Decoding/encoding gammas for the current pass.  linearize_input and
// gamma_encode_output are static constants because they aren't derived, and
// that lets the compiler do dead-code elimination.
//
// Input side: the first pass always decodes with the input gamma.  Later
// passes decode with the intermediate gamma only when every FBO is
// gamma-encoded; otherwise their input is taken as-is (gamma 1.0).
#if defined(FIRST_PASS)
    static const bool linearize_input = true;
    inline float get_pass_input_gamma()     {   return get_input_gamma();   }
#elif defined(GAMMA_ENCODE_EVERY_FBO)
    static const bool linearize_input = true;
    inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
#else
    static const bool linearize_input = false;
    inline float get_pass_input_gamma()     {   return 1.0; }
#endif
// Output side: mirror image of the input side, keyed on LAST_PASS.
#if defined(LAST_PASS)
    static const bool gamma_encode_output = true;
    inline float get_pass_output_gamma()    {   return get_output_gamma();  }
#elif defined(GAMMA_ENCODE_EVERY_FBO)
    static const bool gamma_encode_output = true;
    inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
#else
    static const bool gamma_encode_output = false;
    inline float get_pass_output_gamma()    {   return 1.0; }
#endif

// Users might want to know if bilinear filtering will be gamma-correct:
static const bool gamma_aware_bilinear = !linearize_input;


////////////////////// COLOR ENCODING/DECODING
// FUNCTIONS /////////////////////

inline float4 encode_output(const float4 color)
{
    //  Requires:   color holds this pass's output color.
    //  Returns:    color untouched when gamma_encode_output is false.
    //              Otherwise rgb is raised to 1.0/get_pass_output_gamma();
    //              alpha is forced to 1.0 if assume_opaque_alpha is set and
    //              passed through unchanged if not.
    if(!gamma_encode_output)
    {
        return color;
    }
    float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
    return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), alpha_out);
}

inline float4 decode_input(const float4 color)
{
    //  Requires:   color is a sample read by this pass.
    //  Returns:    color untouched when linearize_input is false.
    //              Otherwise rgb is raised to get_pass_input_gamma(), with
    //              alpha handled exactly as in encode_output().
    if(!linearize_input)
    {
        return color;
    }
    float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
    return float4(pow(color.rgb, float3(get_pass_input_gamma())), alpha_out);
}

inline float4 decode_gamma_input(const float4 color, const float3 gamma)
{
    //  Requires:   gamma is the per-channel exponent to decode with.
    //  Returns:    rgb raised to gamma regardless of linearize_input, with
    //              alpha handled exactly as in encode_output().
    float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
    return float4(pow(color.rgb, gamma), alpha_out);
}

//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
// EDIT: it's the 'const' in front of the coords that's doing it

/////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////

// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
// Provide a wide array of linearizing texture lookup wrapper functions.  The
// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
// lookups are provided for completeness in case that changes someday.  Nobody
// is likely to use the *fetch and *proj functions, but they're included just
// in case.  The only tex*D texture sampling functions omitted are:
//      - tex*Dcmpbias
//      - tex*Dcmplod
//      - tex*DARRAY*
//      - tex*DMS*
//      - Variants returning integers
// Standard line length restrictions are ignored below for vertical brevity.
// The 1D wrappers below are kept only as a reference to the original Cg
// header; they are disabled in this port.
/*
// tex1D:
inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
{   return decode_input(tex1D(tex, tex_coords));   }

inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
{   return decode_input(tex1D(tex, tex_coords));   }

inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
{   return decode_input(tex1D(tex, tex_coords, texel_off));    }

inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
{   return decode_input(tex1D(tex, tex_coords, texel_off));    }

inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }

inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }

inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }

inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }

// tex1Dbias:
inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
{   return decode_input(tex1Dbias(tex, tex_coords));   }

inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }

// tex1Dfetch:
inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
{   return decode_input(tex1Dfetch(tex, tex_coords));  }

inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }

// tex1Dlod:
inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
{   return decode_input(tex1Dlod(tex, tex_coords));    }

inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }

// tex1Dproj:
inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
{   return decode_input(tex1Dproj(tex, tex_coords));   }

inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
{   return decode_input(tex1Dproj(tex, tex_coords));   }

inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }

inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
*/
// tex2D:
// Sample tex at tex_coords (only .xy is used by the float3 overloads) and
// linearize the result with decode_input().
inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }

inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }

// NOTE(review): in the Cg original texel_off is a texel offset, but this port
// hands it to textureLod() as the LOD.  GLSL's offset lookups need a
// constant-expression offset, so this looks like a deliberate stand-in; the
// behavior is kept (with the int->float conversion made explicit) -- confirm
// against callers before changing it.
inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
{   return decode_input(textureLod(tex, tex_coords, float(texel_off)));    }

inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
{   return decode_input(textureLod(tex, tex_coords.xy, float(texel_off)));    }

//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }

//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }

//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }

//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }

// tex2Dbias:
//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
//{   return decode_input(tex2Dbias(tex, tex_coords));   }

//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }

// tex2Dfetch:
//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
//{   return decode_input(tex2Dfetch(tex, tex_coords));  }

//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }

// tex2Dlod:
// Requires:    tex_coords.xy is the sample position and tex_coords.w is the
//              explicit mip level, following Cg's tex2Dlod() convention.
// Returns:     The sample at that mip level, linearized with decode_input().
// The port used to hard-code LOD 0.0 and silently drop tex_coords.w; callers
// that pass w == 0.0 are unaffected by forwarding it.
inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
{   return decode_input(textureLod(tex, tex_coords.xy, tex_coords.w));    }

// NOTE(review): texel_off is used as the LOD here as well (see tex2D_linearize
// above); behavior kept as-is.
inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
{   return decode_input(textureLod(tex, tex_coords.xy, float(texel_off)));     }
/*
// tex2Dproj:
inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
{   return decode_input(tex2Dproj(tex, tex_coords));   }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
{   return decode_input(tex2Dproj(tex, tex_coords));   }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
*/
/*
// tex3D:
inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
{   return decode_input(tex3D(tex, tex_coords));   }

inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
{   return decode_input(tex3D(tex, tex_coords, texel_off));    }

inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }

inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }

// tex3Dbias:
inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
{   return decode_input(tex3Dbias(tex, tex_coords));   }

inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }

// tex3Dfetch:
inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
{   return decode_input(tex3Dfetch(tex, tex_coords));  }

inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }

// tex3Dlod:
inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
{   return decode_input(tex3Dlod(tex, tex_coords));    }

inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }

// tex3Dproj:
inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
{   return decode_input(tex3Dproj(tex, tex_coords));   }

inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
*/

// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
// This narrow selection of nonstandard tex2D* functions can be useful:

// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }

//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }


// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
// Provide a narrower selection of tex2D* wrapper functions that decode an
// input sample with a specified gamma value.  These are useful for reading
// LUT's and for reading the input of pass0 in a later pass.

// tex2D:
// NOTE(review): the tex3D block above used to end in "/////////*", which does
// not terminate a block comment, so the next two overloads were already being
// swallowed by it.  They stay disabled here, now explicitly, to keep behavior
// unchanged.  Presumably nothing calls them; verify before re-enabling.
//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
/*
// tex2Dbias:
inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }

inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }

// tex2Dfetch:
inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }

inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
*/
// tex2Dlod:
// Same contract as tex2Dlod_linearize() (tex_coords.w is the mip level), but
// the sample is decoded with the caller-supplied gamma.
inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
{   return decode_gamma_input(textureLod(tex, tex_coords.xy, tex_coords.w), gamma);   }

// NOTE(review): texel_off is used as the LOD, as in tex2D_linearize above;
// behavior kept as-is.
inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
{   return decode_gamma_input(textureLod(tex, tex_coords.xy, float(texel_off)), gamma);    }


#endif  //  GAMMA_MANAGEMENT_H

//////////////////////////// END GAMMA-MANAGEMENT //////////////////////////

//#include "quad-pixel-communication.h"

/////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION //////////////////////

#ifndef QUAD_PIXEL_COMMUNICATION_H
#define QUAD_PIXEL_COMMUNICATION_H

+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
///////////////////////////////// DESCRIPTION ////////////////////////////////

// Given screen pixel numbers, derive a "quad vector" describing a fragment's
// position in its 2x2 pixel quad.  Given that vector, obtain the values of any
// variable at neighboring fragments.
// Requires:    Using this file in general requires:
//              1.) ddx() and ddy() are present in the current Cg profile.
//              2.) The GPU driver is using fine/high-quality derivatives.
//              Functions will give incorrect results if this is not true,
//              so a test function is included.


///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES ////////////////////

float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
{
    //  Requires:   Two measures of the current fragment's output pixel number
    //              in the range ([0, output_size.x), [0, output_size.y)):
    //              1.) output_pixel_num_wrt_uvxy.xy increase with uv coords.
    //              2.) output_pixel_num_wrt_uvxy.zw increase with screen xy.
    //  Returns:    Two measures of the fragment's position in its 2x2 quad:
    //              1.) The .xy components are its 2x2 placement with respect to
    //                  uv direction (the origin (0, 0) is at the top-left):
    //                  top-left     = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
    //                  bottom-left  = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
    //                  You need this to arrange/weight shared texture samples.
    //              2.) The .zw components are its 2x2 placement with respect to
    //                  screen xy direction (position); the origin varies.
    //                  quad_gather needs this measure to work correctly.
    //              Note: quad_vector.zw = quad_vector.xy * float2(
    //                      ddx(output_pixel_num_wrt_uvxy.x),
    //                      ddy(output_pixel_num_wrt_uvxy.y));
    //  Caveats:    This function assumes the GPU driver always starts 2x2 pixel
    //              quads at even pixel numbers.  This assumption can be wrong
    //              for odd output resolutions (nondeterministically so).
    //  Twice the fractional part of half the pixel number is the parity term,
    //  which is then remapped by (*2 - 1):
    float4 parity = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
    return parity * 2.0 - float4(1.0);
}

float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
{
    //  Requires:   Same as get_quad_vector_naive() (see that first).
    //  Returns:    Same as get_quad_vector_naive() (see that first), but it's
    //              correct even if the 2x2 pixel quad starts at an odd pixel,
    //              which can occur at odd resolutions.
    float4 naive_guess = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
    //  If naive_guess.zw doesn't increase with screen xy, we know the 2x2
    //  pixel quad starts at an odd pixel; half its screen-space derivative
    //  is the per-axis correction factor:
    float mirror_x = ddx(naive_guess.z);
    float mirror_y = ddy(naive_guess.w);
    float2 odd_start_mirror = 0.5 * float2(mirror_x, mirror_y);
    return naive_guess * odd_start_mirror.xyxy;
}

float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
{
    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
    //              2.) output_pixel_num_wrt_uv must increase with uv coords and
    //                  measure the current fragment's output pixel number in:
    //                      ([0, output_size.x), [0, output_size.y))
    //  Returns:    Same as get_quad_vector_naive() (see that first), but it's
    //              correct even if the 2x2 pixel quad starts at an odd pixel,
    //              which can occur at odd resolutions.
    //  Caveats:    This function requires less information than the version
    //              taking a float4, but it's potentially slower.
    //  Do screen coords increase with or against uv?  Get the direction
    //  with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
    float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x),
        ddy(output_pixel_num_wrt_uv.y));
    float2 parity_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
    float2 uv_guess = (parity_wrt_uv - float2(0.5)) * 2.0;
    float2 screen_guess = uv_guess * screen_uv_mirror;
    //  If screen_guess doesn't increase with screen xy, we know the 2x2
    //  pixel quad starts at an odd pixel:
    float2 odd_start_mirror = 0.5 * float2(ddx(screen_guess.x),
        ddy(screen_guess.y));
    float4 combined_guess = float4(uv_guess, screen_guess);
    return combined_guess * odd_start_mirror.xyxy;
}

void quad_gather(float4 quad_vector, float4 curr,
    out float4 adjx, out float4 adjy, out float4 diag)
{
    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
    //              2.) The GPU driver is using fine/high-quality derivatives.
    //              3.) quad_vector describes the current fragment's location in
    //                  its 2x2 pixel quad using get_quad_vector()'s conventions.
    //              4.) curr is any vector you wish to get neighboring values of.
    //  Returns:    Values of an input vector (curr) at neighboring fragments
    //              adjacent x, adjacent y, and diagonal (via out parameters).
    float side_x = quad_vector.z;
    float side_y = quad_vector.w;
    adjx = curr - ddx(curr) * side_x;
    adjy = curr - ddy(curr) * side_y;
    //  The diagonal neighbor is the y-neighbor of the x-neighbor:
    diag = adjx - ddy(adjx) * side_y;
}

void quad_gather(float4 quad_vector, float3 curr,
    out float3 adjx, out float3 adjy, out float3 diag)
{
    //  Float3 version
    float side_x = quad_vector.z;
    float side_y = quad_vector.w;
    adjx = curr - ddx(curr) * side_x;
    adjy = curr - ddy(curr) * side_y;
    diag = adjx - ddy(adjx) * side_y;
}

void quad_gather(float4 quad_vector, float2 curr,
    out float2 adjx, out float2 adjy, out float2 diag)
{
    //  Float2 version
    float side_x = quad_vector.z;
    float side_y = quad_vector.w;
    adjx = curr - ddx(curr) * side_x;
    adjy = curr - ddy(curr) * side_y;
    diag = adjx - ddy(adjx) * side_y;
}

float4 quad_gather(float4 quad_vector, float curr)
{
    //  Float version:
    //  Returns:    return.x == current
    //              return.y == adjacent x
    //              return.z == adjacent y
    //              return.w == diagonal
    float4 gathered = float4(curr);
    gathered.y = gathered.x - ddx(gathered.x) * quad_vector.z;
    //  One ddy() over (current, adjacent x) yields (adjacent y, diagonal):
    gathered.zw = gathered.xy - ddy(gathered.xy) * quad_vector.w;
    return gathered;
}

float4 quad_gather_sum(float4 quad_vector, float4 curr)
{
    //  Requires:   Same as quad_gather()
    //  Returns:    Sum of an input vector (curr) at all fragments in a quad.
    float4 neighbor_x, neighbor_y, neighbor_diag;
    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
    return (curr + neighbor_x + neighbor_y + neighbor_diag);
}

float3 quad_gather_sum(float4 quad_vector, float3 curr)
{
    //  Float3 version:
    float3 neighbor_x, neighbor_y, neighbor_diag;
    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
    return (curr + neighbor_x + neighbor_y + neighbor_diag);
}

float2 quad_gather_sum(float4 quad_vector, float2 curr)
{
    //  Float2 version:
    float2 neighbor_x, neighbor_y, neighbor_diag;
    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
    return (curr + neighbor_x + neighbor_y + neighbor_diag);
}

float quad_gather_sum(float4 quad_vector, float curr)
{
    //  Float version:
    float4 quad_values = quad_gather(quad_vector, curr);
    return (quad_values.x + quad_values.y + quad_values.z + quad_values.w);
}

bool fine_derivatives_working(float4 quad_vector, float4 curr)
{
    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
    //              2.) quad_vector describes the current fragment's location in
    //                  its 2x2 pixel quad using get_quad_vector()'s conventions.
    //              3.) curr must be a test vector with non-constant derivatives
    //                  (its value should change nonlinearly across fragments).
    //  Returns:    true if fine/hybrid/high-quality derivatives are used, or
    //              false if coarse derivatives are used or inconclusive
    //  Usage:      Test whether quad-pixel communication is working!
    //  Method:     We can confirm fine derivatives are used if the following
    //              holds (ever, for any value at any fragment):
    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
    //              The more values we test (e.g. test a float4 two ways), the
    //              easier it is to demonstrate fine derivatives are working.
    //  TODO: Check for floating point exact comparison issues!
    float4 ddx_here = ddx(curr);
    float4 ddy_here = ddy(curr);
    float4 neighbor_x = curr - ddx_here * quad_vector.z;
    float4 neighbor_y = curr - ddy_here * quad_vector.w;
    //  Derivatives as seen from the neighbors, evaluated once each:
    float4 ddy_at_x = ddy(neighbor_x);
    float4 ddx_at_y = ddx(neighbor_y);
    //  Compare component by component so "any component differs" is explicit:
    bool ddy_different = any(bool4(ddy_here.x != ddy_at_x.x, ddy_here.y != ddy_at_x.y, ddy_here.z != ddy_at_x.z, ddy_here.w != ddy_at_x.w));
    bool ddx_different = any(bool4(ddx_here.x != ddx_at_y.x, ddx_here.y != ddx_at_y.y, ddx_here.z != ddx_at_y.z, ddx_here.w != ddx_at_y.w));
    return ddy_different || ddx_different;
}

bool fine_derivatives_working_fast(float4 quad_vector, float curr)
{
    //  Requires:   Same as fine_derivatives_working()
    //  Returns:    Same as fine_derivatives_working()
    //  Usage:      This is faster than fine_derivatives_working() but more
    //              likely to return false negatives, so it's less useful for
    //              offline testing/debugging.  It's also useless as the basis
    //              for dynamic runtime branching as of May 2014: Derivatives
    //              (and quad-pixel communication) are currently disallowed in
    //              branches.  However, future GPU's may allow you to use them
    //              in dynamic branches if you promise the branch condition
    //              evaluates the same for every fragment in the quad (and/or if
    //              the driver enforces that promise by making a single fragment
    //              control branch decisions).  If that ever happens, this
    //              version may become a more economical choice.
    float ddx_here = ddx(curr);
    float ddy_here = ddy(curr);
    float neighbor_x = curr - ddx_here * quad_vector.z;
    return (ddy_here != ddy(neighbor_x));
}

#endif  //  QUAD_PIXEL_COMMUNICATION_H

//////////////////////// END QUAD-PIXEL-COMMUNICATION ///////////////////////

//#include "special-functions.h"

/////////////////////////// BEGIN SPECIAL-FUNCTIONS //////////////////////////

#ifndef SPECIAL_FUNCTIONS_H
#define SPECIAL_FUNCTIONS_H

///////////////////////////////// MIT LICENSE ////////////////////////////////

// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version of the tanh()-based approximation: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version of the tanh()-based approximation: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version of the tanh()-based approximation: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x). + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version (same ERF_FAST_APPROXIMATION dispatch): + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version (same ERF_FAST_APPROXIMATION dispatch): + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version (same ERF_FAST_APPROXIMATION dispatch): + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial); // k = 0 term + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3 (denomK = k! * (s + k)): + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale by z**s and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version (same 4-term series as the float4 overload): + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version (same 4-term series as the float4 overload): + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version (same 4-term series as the float4 overload): + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(0.5/(sigma*sqrt(2.0)))): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blur sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using an 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-5, ignoring constant factors (since we're normalizing). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + 
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const 
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file 
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + 
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. 
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * 
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, 
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * 
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = 
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(video_size/output_size).y + // 6.) 
tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12 + // bilinear samples, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps.
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = 
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + 
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample0*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding.
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +/////////////////////////////// BLOOM CONSTANTS ////////////////////////////// + +// Compute constants with manual inlines of the functions below: +static const float bloom_diff_thresh = 1.0/256.0; + + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +inline float get_absolute_scale_blur_sigma(const float thresh) +{ + // Requires: 1.) min_expected_triads must be a global float. The number + // of horizontal phosphor triads in the final image must be + // >= min_allowed_viewport_triads.x for realistic results. + // 2.) bloom_approx_scale_x must be a global float equal to the + // absolute horizontal scale of BLOOM_APPROX. + // 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x + // should be <= 1.1658025090 to keep the final result < + // 0.62666015625 (the largest sigma ensuring the largest + // unused texel weight stays < 1.0/256.0 for a 3x3 blur). + // 4.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). 
+ // Returns: Return the minimum Gaussian sigma that will blur the pass + // output as much as it would have taken to blur away + // bloom_approx_scale_x horizontal phosphor triads. + // Description: + // BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd + // use the same blur sigma as the actual phosphor bloom and scale it down + // to the current resolution with (bloom_approx_scale_x/viewport_size_x), but + // we don't know the viewport size in this pass. Instead, we'll blur as + // much as it would take to blur away min_allowed_viewport_triads.x. This + // will blur "more than necessary" if the user actually uses more triads, + // but that's not terrible either, because blurring a constant fraction of + // the viewport may better resemble a true optical bloom anyway (since the + // viewport will generally be about the same fraction of each player's + // field of view, regardless of screen size and resolution). + // Assume an extremely large viewport size for asymptotic results. + return bloom_approx_scale_x/max_viewport_size_x * + get_min_sigma_to_blur_triad( + max_viewport_size_x/min_allowed_viewport_triads.x, thresh); +} + +inline float get_center_weight(const float sigma) +{ + // Given a Gaussian blur sigma, get the blur weight for the center texel. 
+ #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return get_fast_gaussian_weight_sum_inv(sigma); + #else + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + // Note: If the implementation uses a smaller blur than the max allowed, + // the worst case scenario is that the center weight will be overestimated, + // so we'll put a bit more energy into the brightpass...no huge deal. 
+ // Then again, if the implementation uses a larger blur than the max + // "allowed" because of dynamic branching, the center weight could be + // underestimated, which is more of a problem...consider always using + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // 43x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + + w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + // 31x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + + w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + // 25x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + // 17x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + #else + // 9x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + const float center_weight = weight_sum_inv * weight_sum_inv; + return center_weight; + #endif +} + +inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // If sigma is static, we can safely branch and use the smallest blur + // that's big enough. Ignore #define hints, because we'll only use a + // large blur if we actually need it, and the branches cost nothing. 
+ #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #else + // It's still worth branching if the profile supports dynamic branches: + // It's much faster than using a hugely excessive blur, but each branch + // eats ~1% FPS. + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #endif + #endif + // Failed optimization notes: + // I originally created a same-size mipmapped 5-tap separable blur10 that + // could handle any sigma by reaching into lower mip levels. It was + // as fast as blur25fast for runtime sigmas and a tad faster than + // blur31fast for static sigmas, but mipmapping two viewport-size passes + // ate 10% of FPS across all codepaths, so it wasn't worth it. + #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + if(sigma <= blur9_std_dev) + { + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur17_std_dev) + { + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur25_std_dev) + { + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur31_std_dev) + { + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + } + else + { + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + } + #else + // If we can't afford to branch, we can only guess at what blur + // size we need. Therefore, use the largest blur allowed. 
+ #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + #else + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + #endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE +} + +inline float get_bloom_approx_sigma(const float output_size_x_runtime, + const float estimated_viewport_size_x) +{ + // Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x. + // This is included for dynamic codepaths just in case the + // following two globals are incorrect: + // 2.) bloom_approx_size_x_for_fake should == the same + // if PHOSPHOR_BLOOM_FAKE is #defined + // 3.) bloom_approx_size_x should == the same otherwise + // Returns: For gaussian4x4, return a dynamic small bloom sigma that's + // as close to optimal as possible given available information. + // For blur3x3, return a static small bloom sigma that + // works well for typical cases. Otherwise, we're using simple + // bilinear filtering, so use static calculations. + // Assume the default static value. This is a compromise that ensures + // typical triads are blurred, even if unusually large ones aren't.
+ static const float mask_num_triads_static = + max(min_allowed_viewport_triads.x, mask_num_triads_desired_static); + const float mask_num_triads_from_size = + estimated_viewport_size_x/mask_triad_size_desired; + const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x, + lerp(mask_num_triads_from_size, mask_num_triads_desired, + mask_specify_num_triads)); + // Assume an extremely large viewport size for asymptotic results: + static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // Use the runtime num triads and output size: + const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_runtime; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_runtime/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // account for the Gaussian scanline sigma from the last pass too. + // The bloom will be too wide horizontally but tall enough vertically. + return length(float2(bloom_approx_sigma, beam_max_sigma)); + } + else // 3x3 blur resize (the bilinear resize doesn't need a sigma) + { + // We're either using blur3x3 or bilinear filtering. The biggest + // reason to choose blur3x3 is to avoid dynamic weights, so use a + // static calculation. 
+ #ifdef PHOSPHOR_BLOOM_FAKE + static const float output_size_x_static = + bloom_approx_size_x_for_fake; + #else + static const float output_size_x_static = bloom_approx_size_x; + #endif + static const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_static; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_static/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // try accounting for the Gaussian scanline sigma from the last pass + // too; use the static default value: + return length(float2(bloom_approx_sigma, beam_max_sigma_static)); + } +} + +inline float get_final_bloom_sigma(const float bloom_sigma_runtime) +{ + // Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's + // optimal for the [known] triad size. + // 2.) Call this from a fragment shader (not a vertex shader), + // or blurring with static sigmas won't be constant-folded. + // Returns: Return the optimistic static sigma if the triad size is + // known at compile time. Otherwise return the optimal runtime + // sigma (10% slower) or an implementation-specific compromise + // between an optimistic or pessimistic static sigma. + // Notes: Call this from the fragment shader, NOT the vertex shader, + // so static sigmas can be constant-folded! 
+ const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad( + mask_triad_size_desired_static, bloom_diff_thresh); + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return bloom_sigma_runtime; + #else + // Overblurring looks as bad as underblurring, so assume average-size + // triads, not worst-case huge triads: + return bloom_sigma_optimistic; + #endif +} + + +#endif // BLOOM_FUNCTIONS_H + +//////////////////////////// END BLOOM-FUNCTIONS /////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// output_size < video_size. +// 4.) output_size == video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (video_size/output_size)/texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(video_size/output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) 
The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) 
tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. +// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
+ static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; + #else + // The defaults are the largest values that keep the largest unused + // blur term on each side <= 1.0/256.0. (We could get away with more + // or be more conservative, but this compromise is pretty reasonable.) + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; + #endif // USE_BINOMIAL_BLUR_STD_DEVS +#endif // OVERRIDE_BLUR_STD_DEVS + +#ifndef OVERRIDE_ERROR_BLURRING + // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing + // in shared-sample blurs but increase blurring and feature shifting. 
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) 
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.)
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA // If defined, the includer must already have declared all seven constants below + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? Also the default input, intermediate and output gamma when nothing is simulated or overridden + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55); default returned by get_crt_gamma() + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55); this default sits at the lower bound + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB; default returned by get_lcd_gamma() +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. The default (false) keeps the sampled alpha.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else // Defaults: CRT uses the high reference gamma, LCD uses the office gamma + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA // Taking this branch bypasses every SIMULATE_* check in the else branch + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD // Tested first, so it wins if several SIMULATE_* macros are defined + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO // Default path: only the first pass decodes and only the last pass encodes + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else // Not the first pass: decode_input() returns the sample unchanged + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else // Not the last pass: encode_output() returns the color unchanged + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else // GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else // Later passes read what the previous pass encoded with the intermediate gamma + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else // Earlier passes write with the intermediate gamma + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif // GAMMA_ENCODE_EVERY_FBO + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; // True only when this pass applies no manual decode after sampling + + +////////////////////// COLOR ENCODING/DECODING
FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) // Returns color with rgb raised to 1.0/get_pass_output_gamma() when gamma_encode_output is true, otherwise color unchanged +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) // Alpha is replaced by 1.0 only on the encoding path + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else // No encoding for this pass: alpha is passed through even if assume_opaque_alpha is set + { + return color; + } +} + +inline float4 decode_input(const float4 color) // Returns color with rgb raised to get_pass_input_gamma() when linearize_input is true, otherwise color unchanged +{ + if(linearize_input) + { + if(assume_opaque_alpha) // Alpha is replaced by 1.0 only on the decoding path + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else // No decoding for this pass: alpha is passed through even if assume_opaque_alpha is set + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) // Always raises color.rgb to the caller-supplied per-channel gamma; linearize_input is not consulted +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it (the wrappers that are still compiled take non-const coords) + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: the overloads that are not commented out take non-const coords, presumably because of the const issue recorded in the EDIT note before this section +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) // Samples tex at tex_coords and passes the result through decode_input() +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) // Same, but only tex_coords.xy is used +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) // NOTE(review): texel_off is forwarded as the third argument of textureLod, which GLSL defines as the level of detail rather than a texel offset; confirm this is intended +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) // NOTE(review): same texel_off-as-LOD forwarding as the float2 overload; only tex_coords.xy is used +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) // Always samples at LOD 0.0 using tex_coords.xy; tex_coords.zw is ignored +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) // NOTE(review): texel_off is used as the textureLod level of detail here, not as a texel offset; tex_coords.zw is ignored +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: NOTE(review): the slash-run marker that follows the tex3D wrappers ends in an asterisk and so does not terminate their block comment; that comment appears to run up to the terminator just before this line, which leaves every tex2D_linearize_gamma overload compiled out. Confirm this is intended. +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) // Samples tex_coords.xy at LOD 0.0 and decodes with the supplied gamma; tex_coords.zw is ignored +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) // NOTE(review): texel_off is used as the textureLod level of detail here, not as a texel offset +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "quad-pixel-communication.h" + +/////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION ////////////////////// + +#ifndef QUAD_PIXEL_COMMUNICATION_H +#define QUAD_PIXEL_COMMUNICATION_H +
+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// Given screen pixel numbers, derive a "quad vector" describing a fragment's +// position in its 2x2 pixel quad. Given that vector, obtain the values of any +// variable at neighboring fragments. +// Requires: Using this file in general requires: +// 1.) ddx() and ddy() are present in the current Cg profile. +// 2.) The GPU driver is using fine/high-quality derivatives. +// Functions will give incorrect results if this is not true, +// so a test function (fine_derivatives_working) is included. + + +///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES //////////////////// + +float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Two measures of the current fragment's output pixel number + // in the range ([0, output_size.x), [0, output_size.y)): + // 1.) output_pixel_num_wrt_uvxy.xy increase with uv coords. + // 2.) output_pixel_num_wrt_uvxy.zw increase with screen xy. + // Returns: Two measures of the fragment's position in its 2x2 quad: + // 1.) The .xy components are its 2x2 placement with respect to + // uv direction (the origin (0, 0) is at the top-left): + // top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0) + // bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0) + // You need this to arrange/weight shared texture samples. + // 2.) The .zw components are its 2x2 placement with respect to + // screen xy direction (position); the origin varies. + // quad_gather needs this measure to work correctly. + // Note: quad_vector.zw = quad_vector.xy * float2( + // ddx(output_pixel_num_wrt_uvxy.x), + // ddy(output_pixel_num_wrt_uvxy.y)); + // Caveats: This function assumes the GPU driver always starts 2x2 pixel + // quads at even pixel numbers. This assumption can be wrong + // for odd output resolutions (nondeterministically so). Method: for integer-valued inputs each component maps to -1.0 when the pixel number is even and 1.0 when it is odd.
+ float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; + float4 quad_vector = pixel_odd * 2.0 - float4(1.0); + return quad_vector; +} + +float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Same as get_quad_vector_naive() (see that first). + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + float4 quad_vector_guess = + get_quad_vector_naive(output_pixel_num_wrt_uvxy); + // If quad_vector_guess.zw doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z), + ddy(quad_vector_guess.w)); // Half the in-quad difference of a -1/1 guess: expected to be 1 when the guess is right and -1 when it must be mirrored (assumes fine derivatives) + return quad_vector_guess * odd_start_mirror.xyxy; // The .xyxy swizzle applies the x mirror to .x and .z, and the y mirror to .y and .w +} + +float4 get_quad_vector(float2 output_pixel_num_wrt_uv) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) output_pixel_num_wrt_uv must increase with uv coords and + // measure the current fragment's output pixel number in: + // ([0, output_size.x), [0, output_size.y)) + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + // Caveats: This function requires less information than the version + // taking a float4, but it's potentially slower. + // Do screen coords increase with or against uv? Get the direction + // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
+ float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x), + ddy(output_pixel_num_wrt_uv.y)); + float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; + float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0; + float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; + // If quad_vector_screen_guess doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x), + ddy(quad_vector_screen_guess.y)); + float4 quad_vector_guess = float4( + quad_vector_uv_guess, quad_vector_screen_guess); // Packs the uv-relative guess into .xy and the screen-relative guess into .zw + return quad_vector_guess * odd_start_mirror.xyxy; // Same mirroring step as the float4 overload +} + +void quad_gather(float4 quad_vector, float4 curr, + out float4 adjx, out float4 adjy, out float4 diag) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) The GPU driver is using fine/high-quality derivatives. + // 3.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 4.) curr is any vector you wish to get neighboring values of. + // Returns: Values of an input vector (curr) at neighboring fragments + // adjacent x, adjacent y, and diagonal (via out parameters). Only quad_vector.zw is read.
+ adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float3 curr, + out float3 adjx, out float3 adjy, out float3 diag) +{ + // Float3 version: same requirements and outputs as the float4 overload + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; // Diagonal is derived by stepping in y from the adjacent-x value +} + +void quad_gather(float4 quad_vector, float2 curr, + out float2 adjx, out float2 adjy, out float2 diag) +{ + // Float2 version: same requirements and outputs as the float4 overload + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; // Diagonal is derived by stepping in y from the adjacent-x value +} + +float4 quad_gather(float4 quad_vector, float curr) +{ + // Float version: results are packed into the return value instead of out parameters + // Returns: return.x == current + // return.y == adjacent x + // return.z == adjacent y + // return.w == diagonal + float4 all = float4(curr); + all.y = all.x - ddx(all.x) * quad_vector.z; + all.zw = all.xy - ddy(all.xy) * quad_vector.w; // One ddy on (current, adjacent x) yields adjacent y in .z and the diagonal in .w + return all; +} + +float4 quad_gather_sum(float4 quad_vector, float4 curr) +{ + // Requires: Same as quad_gather() + // Returns: Sum of an input vector (curr) at all fragments in a quad. + float4 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float3 quad_gather_sum(float4 quad_vector, float3 curr) +{ + // Float3 version: sum of curr over the quad via the float3 quad_gather overload + float3 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float2 quad_gather_sum(float4 quad_vector, float2 curr) +{ + // Float2 version: sum of curr over the quad via the float2 quad_gather overload + float2 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float quad_gather_sum(float4 quad_vector, float curr) +{ + // Float version: adds the four components returned by the scalar quad_gather overload + float4 all_values = quad_gather(quad_vector, curr); + return (all_values.x + all_values.y + all_values.z + all_values.w); +} + +bool fine_derivatives_working(float4 quad_vector, float4 curr) +{ + // Requires: 1.)
ddx() and ddy() are present in the current Cg profile. + // 2.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 3.) curr must be a test vector with non-constant derivatives + // (its value should change nonlinearly across fragments). + // Returns: true if fine/hybrid/high-quality derivatives are used, or + // false if coarse derivatives are used or inconclusive + // Usage: Test whether quad-pixel communication is working! + // Method: We can confirm fine derivatives are used if the following + // holds (ever, for any value at any fragment): + // (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy)) + // The more values we test (e.g. test a float4 two ways), the + // easier it is to demonstrate fine derivatives are working. + // TODO: Check for floating point exact comparison issues! + float4 ddx_curr = ddx(curr); + float4 ddy_curr = ddy(curr); + float4 adjx = curr - ddx_curr * quad_vector.z; + float4 adjy = curr - ddy_curr * quad_vector.w; + bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w)); + bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w)); + return any(bool2(ddy_different, ddx_different)); +} + +bool fine_derivatives_working_fast(float4 quad_vector, float curr) +{ + // Requires: Same as fine_derivatives_working() + // Returns: Same as fine_derivatives_working() + // Usage: This is faster than fine_derivatives_working() but more + // likely to return false negatives, so it's less useful for + // offline testing/debugging. It's also useless as the basis + // for dynamic runtime branching as of May 2014: Derivatives + // (and quad-pixel communication) are currently disallowed in + // branches. 
However, future GPU's may allow you to use them + // in dynamic branches if you promise the branch condition + // evaluates the same for every fragment in the quad (and/or if + // the driver enforces that promise by making a single fragment + // control branch decisions). If that ever happens, this + // version may become a more economical choice. + float ddx_curr = ddx(curr); + float ddy_curr = ddy(curr); + float adjx = curr - ddx_curr * quad_vector.z; + return (ddy_curr != ddy(adjx)); +} + +#endif // QUAD_PIXEL_COMMUNICATION_H + +//////////////////////// END QUAD-PIXEL-COMMUNICATION /////////////////////// + +//#include "special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings: + // erft() if ERF_FAST_APPROXIMATION is defined, else erf6(). + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like (before scaling by pow(z, s)): + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0) * last_factorial); + // for(int i = 1; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + i) * last_factorial); + // } + // Unrolled, constant-folded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blur sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using an 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-5, ignoring constant factors (since we're normalizing). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + // Each *_ratio is the fractional offset toward the farther texel of a pair; + // with linear filtering and dxdy equal to one texel step (assumed here), one + // tap at that offset scaled by the pair's combined weight equals the two + // separately weighted texel reads. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11fast() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + 
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const 
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file 
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + 
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. 
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * 
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, 
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * 
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor) over 1 center, 4 axis-aligned, and 4 diagonal samples: + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry (the 1R offset is negated for 1L/1U and .yx-swizzled for 1D/1U): + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U =
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute the texel offset from the fragment center to the + // bilinear sample in the bottom-right quadrant (mirrored for the other three): + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(video_size/output_size).y + // 6.)
tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12 + // bilinear samples per fragment, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps.
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy =
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch samples 1-7 from the other fragments in the 2x2 quad in order of need (sample8 is not gathered; only sample8curr is weighted): + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result.
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) +
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample0*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +void main() { + // Sample the masked scanlines: + const float3 intensity_dim = + tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb; + // Get the full intensity, including auto-undimming, and mask compensation: + const float auto_dim_factor = levels_autodim_temp; + const float undim_factor = 1.0/auto_dim_factor; + const float mask_amplify = get_mask_amplify(); + const float3 intensity = intensity_dim * undim_factor * mask_amplify * + levels_contrast; + + // Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines + // would look like, so we can estimate how much energy we'll receive from + // blooming neighbors: + const float3 phosphor_blur_approx = levels_contrast * tex2D_linearize( + BLOOM_APPROXtexture, blur3x3_tex_uv).rgb; + + // Compute the blur weight for the center texel and the maximum energy we + // expect to receive from neighbors: + const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime); + const float center_weight = get_center_weight(bloom_sigma); + const float3 max_area_contribution_approx = + max(float3(0.0, 0.0, 0.0), phosphor_blur_approx - center_weight * intensity); + // Assume neighbors will blur 100% of their intensity (blur_ratio = 1.0), + // because it actually gets better results (on top of being very simple), + // but adjust all intensities for the user's desired underestimate factor: + const float3 area_contrib_underestimate = + bloom_underestimate_levels * max_area_contribution_approx; + const float3 intensity_underestimate = + bloom_underestimate_levels * intensity; + // Calculate the blur_ratio, the ratio of intensity we want to blur: + #ifdef BRIGHTPASS_AREA_BASED + // This area-based version changes blur_ratio more smoothly and blurs + // more, clipping less but offering less phosphor differentiation: + const float3 
phosphor_blur_underestimate = bloom_underestimate_levels * + phosphor_blur_approx; + const float3 soft_intensity = max(intensity_underestimate, + phosphor_blur_underestimate * mask_amplify); + const float3 blur_ratio_temp = + ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) / + soft_intensity - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0); + #else + const float3 blur_ratio_temp = + ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) / + intensity_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0); + #endif + const float3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0); + // Calculate the brightpass based on the auto-dimmed, unamplified, masked + // scanlines, encode if necessary, and return! + const float3 brightpass = intensity_dim * + lerp(blur_ratio, float3(1.0, 1.0, 1.0), bloom_excess); + FragColor = encode_output(float4(brightpass, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/brightpass.vs b/shaders/CRT-Royale.shader/brightpass.vs new file mode 100644 index 00000000..2d02d72a --- /dev/null +++ b/shaders/CRT-Royale.shader/brightpass.vs @@ -0,0 +1,6551 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 scanline_tex_uv; + vec2 blur3x3_tex_uv; + float bloom_sigma_runtime; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 
+#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define MASKED_SCANLINEStexture source[0] +#define MASKED_SCANLINEStexture_size sourceSize[0].xy +#define MASKED_SCANLINESvideo_size sourceSize[0].xy +#define BLOOM_APPROXtexture source[3] +#define BLOOM_APPROXtexture_size sourceSize[3].xy +#define BLOOM_APPROXvideo_size sourceSize[3].xy + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an earlier remark here said 0.5 was unnecessarily dark and that the value had been set to 1.0, but the value in effect is 0.5. 
+ +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an earlier note here said 0.5 was unnecessarily dark and the value was set to 1.0, but the value in effect here is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise it toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (try 1.0 if this looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors settings which still need "cooking:" +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the 
Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not.
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader.
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. 
This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. 
LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif //
SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + 
/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////

// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA


//////////////////////////////////  INCLUDES  //////////////////////////////////

//#include "../user-settings.h"
//#include "derived-settings-and-constants.h"

/////////////////////////////  CODEPATH SELECTION  /////////////////////////////

// Choose a looping strategy based on what's allowed:
// Dynamic loops not allowed: Use a flat static loop.
// Dynamic loops accommodated: Coarsely branch around static loops.
// Dynamic loops assumed allowed: Use a flat dynamic loop.
// Exactly one of the following holds after this block:
//   - DRIVERS_ALLOW_DYNAMIC_BRANCHES is defined (neither macro below is set),
//   - BREAK_LOOPS_INTO_PIECES is defined, or
//   - USE_SINGLE_STATIC_LOOP is defined.
// The resampling code later in this file tests these macros with #ifdef.
#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
        #define BREAK_LOOPS_INTO_PIECES
    #else
        #define USE_SINGLE_STATIC_LOOP
    #endif
#endif  // No else needed: Dynamic loops assumed.


//////////////////////////////////  CONSTANTS  /////////////////////////////////

// The larger the resized tile, the fewer samples we'll need for downsizing.
// See if we can get a static min tile size > mask_min_allowed_tile_size:
static const float mask_min_allowed_tile_size = ceil(
    mask_min_allowed_triad_size * mask_triads_per_tile);
static const float mask_min_expected_tile_size =
    mask_min_allowed_tile_size;
// Limit the number of sinc resize taps by the maximum minification factor:
static const float pi_over_lobes = pi/mask_sinc_lobes;
static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
// Vectorized loops sample in multiples of 4. Round up to be safe:
static const float max_sinc_resize_samples_m4 = ceil(
    max_sinc_resize_samples_float * 0.25) * 4.0;


/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////

inline float get_dynamic_loop_size(const float magnification_scale)
{
    // Requires: The following global constants must be defined:
    //           1.) mask_sinc_lobes
    //           2.) max_sinc_resize_samples_m4
    // Returns: The minimum number of texture samples for a correct downsize
    //          at magnification_scale, rounded up to a multiple of 4 and
    //          capped at the maximum computed below.
    // We're downsizing, so the filter is sized across 2*lobes output pixels
    // (not 2*lobes input texels). This impacts distance measurements and the
    // minimum number of input samples needed.
    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
        const float max_samples_m4 = max_sinc_resize_samples_m4;
    #else   // ifdef BREAK_LOOPS_INTO_PIECES
        // Simulating loops with branches imposes a 128-sample limit.
        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
    #endif
    return min(min_samples_m4, max_samples_m4);
}

float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
    const float2 tex_size, const float dr,
    const float input_tiles_per_texture_r, const float samples,
    static const bool vertical)
{
    // Requires: 1.) dr == du == 1.0/texture_size.x or
    //               dr == dv == 1.0/texture_size.y
    //               (whichever direction we're resampling in).
    //               It's a scalar to save register space.
    //           2.) input_tiles_per_texture_r is the number of input tiles
    //               that can fit in the input texture in the direction we're
    //               resampling this pass.
    //           3.) vertical indicates whether we're resampling vertically
    //               this pass (or horizontally).
    // Returns: Pack and return the first sample's tile_uv coord in [0, 1]
    //          and its texel distance from the destination pixel, in the
    //          resized dimension only.
    // We'll start with the topmost or leftmost sample and work down or right,
    // so get the first sample location and distance. Modify both dimensions
    // as if we're doing a one-pass 2D resize; we'll throw away the unneeded
    // (and incorrect) dimension at the end.
    const float2 curr_texel = tex_uv * tex_size;
    const float2 prev_texel =
        floor(curr_texel - float2(under_half)) + float2(0.5);
    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
    const float2 first_texel_uv_wrap_2D = first_texel * dr;
    const float2 first_texel_dist_2D = curr_texel - first_texel;
    // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
    const float2 first_texel_tile_uv_wrap_2D =
        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
    // Project wrapped coordinates to the [0, 1] range. We'll do this with all
    // samples, but the first texel is special, since it might be negative.
    const float2 coord_negative =
        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
    const float2 first_texel_tile_uv_2D =
        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
    // Pack the first texel's tile_uv coord and texel distance in 1D:
    const float2 tile_u_and_dist =
        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
    const float2 tile_v_and_dist =
        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
    return vertical ? tile_v_and_dist : tile_u_and_dist;
    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
}

inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
{
    // Returns: The texel of tex at tex_uv, sampled through whichever lookup
    //          the anisotropic-resampling compatibility options select.
    // Mipmapping and anisotropic filtering get confused by sinc-resampling.
    // One [slow] workaround is to select the lowest mip level:
    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
        // textureLod takes the LOD as its own third argument (Cg's tex2Dlod
        // packed it into the coordinate's w component), so pass mip level 0
        // explicitly, as the tex2Dlod_linearize wrappers in this file do.
        return textureLod(tex, tex_uv, 0.0);
    #else
        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
            // NOTE(review): tex2Dbias is a Cg intrinsic and no definition of
            // it is visible in this part of the file. If this path fails to
            // compile, texture(tex, tex_uv, -16.0) is the likely equivalent;
            // confirm before enabling this option.
            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
        #else
            return texture(tex, tex_uv);
        #endif
    #endif
}


//////////////////////////////  LOOP BODY MACROS  //////////////////////////////

// Using inline functions can exceed the temporary register limit, so we're
// stuck with #define macros (I'm TRULY sorry). They're declared here instead
// of above to be closer to the actual invocation sites. Steps:
// 1.) Get the exact texel location.
// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
// 3.) Get the distance from the current pixel and sinc weight:
//         sinc(dist) = sin(pi * dist)/(pi * dist)
//     We can also use the slower/smoother Lanczos instead:
//         L(x) = sinc(dist) * sinc(dist / lobes)
// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels
//     in pixel_color (we'll normalize outside the loop at the end).
// We vectorize the loop to help reduce the Lanczos window's cost.

    // The r coord is the coord in the dimension we're resizing along (u or v),
    // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
    // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
    // for four new texel samples.
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_horizontal_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. 
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonally opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture.
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. 
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (if that looks unnecessarily dark, try 1.0) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default and current value: 0.5 (raise toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial); // k = 0 term + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    //  With GAMMA_ENCODE_EVERY_FBO defined, every pass decodes its input and
+    //  encodes its output.  Only the gamma used at each end depends on
+    //  whether this is the first and/or the last pass; all other pass
+    //  boundaries use the intermediate gamma.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#else
+    //  Without GAMMA_ENCODE_EVERY_FBO, only the first pass linearizes its
+    //  input and only the last pass gamma-encodes its output.  Every other
+    //  pass boundary reports a gamma of 1.0 and skips the conversion.
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0; }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0; }
+    #endif
+#endif
+
+//  Users might want to know if bilinear filtering will be gamma-correct.
+//  This is true exactly when the current pass does not linearize its input:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Returns:    color with its RGB raised to 1.0/get_pass_output_gamma()
+    //              when gamma_encode_output is set; otherwise color is
+    //              returned unchanged.  Alpha is forced to 1.0 when
+    //              assume_opaque_alpha is set and passed through otherwise.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Returns:    color with its RGB raised to get_pass_input_gamma() when
+    //              linearize_input is set; otherwise color is returned
+    //              unchanged.  Alpha is forced to 1.0 when
+    //              assume_opaque_alpha is set and passed through otherwise.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 decoded_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, out_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Returns:    color with its RGB raised (per channel) to the caller's
+    //              gamma.  Unlike decode_input(), this always converts; it
+    //              does not consult linearize_input.  Alpha is forced to 1.0
+    //              when assume_opaque_alpha is set and passed through
+    //              otherwise.
+    float3 decoded_rgb = pow(color.rgb, gamma);
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, out_alpha);
+}
+
+//TODO/FIXME: Replacing the lookup wrappers with this macro fixes the blurs
+//  being offset:
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+//  EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian integral_contrib() for detailed comments! 
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), intermediate_gamma); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling. 
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. (From the code: all arguments are forwarded to sample_single_scanline_horizontal(); with beam_misconvergence, R/G/B are each sampled at tex_uv minus their own x convergence offset scaled by texture_size_inv.x.) + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames.
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +/////////////////////////////// END VERTEX-INCLUDES ///////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + +// copied from bloom-functions.h +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. 
error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + +void main() { + gl_Position = position; + vTexCoord = texCoord * 1.00001; + float2 tex_uv = vTexCoord.xy; + // Our various input textures use different coords: + float2 video_uv = tex_uv * texture_size/video_size; + //video_uv = video_uv; + scanline_tex_uv = video_uv * MASKED_SCANLINESvideo_size / + MASKED_SCANLINEStexture_size; + blur3x3_tex_uv = video_uv;// * BLOOM_APPROXvideo_size / BLOOM_APPROXtexture_size; + + // Calculate a runtime bloom_sigma in case it's needed: + const float mask_tile_size_x = get_resized_mask_tile_size( + output_size, output_size * mask_resize_viewport_scale, false).x; + bloom_sigma_runtime = get_min_sigma_to_blur_triad( + mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs new file mode 100644 index 00000000..c89e4671 --- /dev/null +++ b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs @@ -0,0 +1,4748 @@ +#version 150 + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; +uniform int phase; + +in Vertex { + vec2 vTexCoord; + vec2 uv_step; + float interlaced; +}; + +out vec4 FragColor; + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently 
converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#ifdef GL_ES +#ifdef GL_FRAGMENT_PRECISION_HIGH +precision highp float; +#else +precision mediump float; +#endif +#define COMPAT_PRECISION mediump +#else +#define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 +#define COMPAT_VARYING in +#define COMPAT_TEXTURE texture +#else +#define COMPAT_VARYING varying +#define FragColor gl_FragColor +#define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +#define FIRST_PASS +#define SIMULATE_CRT_ON_LCD + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. 
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities.
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an older remark here said the value was raised to 1.0 because 0.5 looked too dark, but the constant is 0.5 -- confirm which value is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant here, but OVERRIDE_DEVICE_GAMMA requires it to be defined. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset()
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking:" +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject
to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not.
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader.
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details.
This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. 
LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif //
SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct (the flag is true exactly when linearize_input is false): +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return
float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//NOTE: Replacing the lookup wrappers with this macro fixed the blurs being offset: +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: The cause is the 'const' qualifier in front of the coords parameters of the wrappers. + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: Both overloads read only tex_coords.xy and linearize via decode_input(). The first samples at a fixed LOD of 0.0 (tex_coords.zw are ignored); the second passes texel_off as the third argument of textureLod(). NOTE(review): in GLSL that argument is the LOD, not a texel offset; confirm this is intended. +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: Sample tex at tex_coords.xy and decode with decode_gamma_input() using the supplied gamma. The first overload uses a fixed LOD of 0.0; the second passes texel_off as the third argument of textureLod(). NOTE(review): the tex2D_linearize_gamma overloads earlier in this header appear to sit inside the block comment opened before the tex3D section, because the line of slashes ending in an asterisk after tex3Dproj does not close it; confirm whether that is intended. +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + 
+///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. 
+// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note said this was raised to 1.0 because 0.5 looked unnecessarily dark, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): this comment used to say 0.5 was unnecessarily dark and the value was set to 1.0, but the constant here is 0.5 - confirm the intended value + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial) + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.)
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines.
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
//  Input side of the current pass: should samples be linearized, and with
//  which gamma?
#ifdef GAMMA_ENCODE_EVERY_FBO
    //  Every FBO holds gamma-encoded data, so every pass has to decode.
    static const bool linearize_input = true;
    #ifdef FIRST_PASS
        inline float get_pass_input_gamma()     {   return get_input_gamma();           }
    #else
        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
    #endif
#else
    //  Only the first pass decodes; a gamma of 1.0 is reported otherwise.
    #ifdef FIRST_PASS
        static const bool linearize_input = true;
        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
    #else
        static const bool linearize_input = false;
        inline float get_pass_input_gamma()     {   return 1.0; }
    #endif
#endif  //  GAMMA_ENCODE_EVERY_FBO

//  Output side of the current pass: should results be gamma-encoded, and with
//  which gamma?
#ifdef GAMMA_ENCODE_EVERY_FBO
    //  Every FBO holds gamma-encoded data, so every pass has to encode.
    static const bool gamma_encode_output = true;
    #ifdef LAST_PASS
        inline float get_pass_output_gamma()    {   return get_output_gamma();          }
    #else
        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
    #endif
#else
    //  Only the last pass encodes; a gamma of 1.0 is reported otherwise.
    #ifdef LAST_PASS
        static const bool gamma_encode_output = true;
        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
    #else
        static const bool gamma_encode_output = false;
        inline float get_pass_output_gamma()    {   return 1.0; }
    #endif
#endif  //  GAMMA_ENCODE_EVERY_FBO

//  Users might want to know if bilinear filtering will be gamma-correct:
static const bool gamma_aware_bilinear = !linearize_input;


//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////

//  Gamma-encode a color for this pass's output.
//  Returns color unchanged when gamma_encode_output is false.  Otherwise rgb is
//  raised to 1.0/get_pass_output_gamma(); alpha is forced to 1.0 when
//  assume_opaque_alpha is set and passed through otherwise.
inline float4 encode_output(const float4 color)
{
    if(!gamma_encode_output)
    {
        return color;
    }
    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
    if(assume_opaque_alpha)
    {
        return float4(encoded_rgb, 1.0);
    }
    return float4(encoded_rgb, color.a);
}

//  Linearize a color sampled as this pass's input.
//  Returns color unchanged when linearize_input is false.  Otherwise rgb is
//  raised to get_pass_input_gamma(); alpha is forced to 1.0 when
//  assume_opaque_alpha is set and passed through otherwise.
inline float4 decode_input(const float4 color)
{
    if(!linearize_input)
    {
        return color;
    }
    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
    if(assume_opaque_alpha)
    {
        return float4(linear_rgb, 1.0);
    }
    return float4(linear_rgb, color.a);
}

//  Decode a color with an explicit per-channel gamma, regardless of the pass
//  settings: rgb is raised to gamma, and alpha follows assume_opaque_alpha
//  exactly as in decode_input().
inline float4 decode_gamma_input(const float4 color, const float3 gamma)
{
    float3 decoded_rgb = pow(color.rgb, gamma);
    if(assume_opaque_alpha)
    {
        return float4(decoded_rgb, 1.0);
    }
    return float4(decoded_rgb, color.a);
}

//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
// EDIT: it's the 'const' in front of the coords that's doing it

///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////

//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
//  Provide a wide array of linearizing texture lookup wrapper functions.  The
//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
//  lookups are provided for completeness in case that changes someday.  Nobody
//  is likely to use the *fetch and *proj functions, but they're included just
//  in case.  The only tex*D texture sampling functions omitted are:
//      - tex*Dcmpbias
//      - tex*Dcmplod
//      - tex*DARRAY*
//      - tex*DMS*
//      - Variants returning integers
//  Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian integral_contrib() for detailed comments! 
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), intermediate_gamma); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling. 
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +void main() { + const float2 tex_uv = vTexCoord.xy; + // Linearize the input based on CRT gamma and bob interlaced fields. + // Bobbing ensures we can immediately blur without getting artifacts. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + if(bool(interlace_detect)) + { + // Sample the current line and an average of the previous/next line; + // tex2D_linearize will decode CRT gamma. Don't bother branching: + const float2 v_step = float2(0.0, uv_step.y); + const float3 curr_line = tex2D_linearize( + input_texture, tex_uv).rgb; + const float3 last_line = tex2D_linearize( + input_texture, tex_uv - v_step).rgb; + const float3 next_line = tex2D_linearize( + input_texture, tex_uv + v_step).rgb; + const float3 interpolated_line = 0.5 * (last_line + next_line); + // If we're interlacing, determine which field curr_line is in: + const float modulus = interlaced + 1.0; + const float field_offset = + fmod(frame_count + interlace_bff, modulus); + const float curr_line_texel = tex_uv.y * texture_size.y; + // Use under_half to fix a rounding bug around exact texel locations. 
+ const float line_num_last = floor(curr_line_texel - under_half); + const float wrong_field = fmod(line_num_last + field_offset, modulus); + // Select the correct color, and output the result: + const float3 color = lerp(curr_line, interpolated_line, wrong_field); + FragColor = encode_output(float4(color, 1.0)); + } + else + { + FragColor = encode_output(tex2D_linearize(input_texture, tex_uv)); + } +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs new file mode 100644 index 00000000..12b93534 --- /dev/null +++ b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs @@ -0,0 +1,4704 @@ +#version 150 + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 uv_step; + float interlaced; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define 
aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#ifdef GL_ES +#ifdef GL_FRAGMENT_PRECISION_HIGH +precision highp float; +#else +precision mediump float; +#endif +#define COMPAT_PRECISION mediump +#else +#define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 +#define COMPAT_VARYING in +#define COMPAT_TEXTURE texture +#else +#define COMPAT_VARYING varying +#define FragColor gl_FragColor +#define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're 
using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +#define FIRST_PASS +#define SIMULATE_CRT_ON_LCD + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, now just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": get_mask_amplify() returns the reciprocal of the average mask color chosen by mask_type (< 0.5 grille, < 1.5 slot, otherwise shadow). +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject 
to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. 
+// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. 
+// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. 
This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. 
LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // 
SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) // Raises color.rgb to 1.0/get_pass_output_gamma() when gamma_encode_output is true, else returns color unchanged; alpha is forced to 1.0 if assume_opaque_alpha. +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) // Raises color.rgb to get_pass_input_gamma() when linearize_input is true, else returns color unchanged; alpha is forced to 1.0 if assume_opaque_alpha. +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) // Always raises color.rgb to the caller-supplied per-channel gamma (linearize_input is not consulted); alpha is forced to 1.0 if assume_opaque_alpha. +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: the float3 overloads sample with tex_coords.xy only. NOTE(review): the overloads taking texel_off pass it to textureLod() as the LOD argument rather than applying it as a texel offset -- confirm this is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: (only tex_coords.xy is sampled; the first overload always uses LOD 0.0 and the second passes texel_off as the LOD) +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* NOTE(review): this marker does not terminate the block comment opened before the tex3D section; the comment appears to run on to the next terminator, after tex2Dfetch_linearize_gamma, which would leave the tex2D_linearize_gamma overloads inactive - confirm intent + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: (only the float2 and float3 coordinate forms are written as code; the texel-offset and derivative forms that follow are commented out) +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: (only tex_coords.xy is sampled; the first overload always uses LOD 0.0 and the second passes texel_off as the LOD) +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + 
+///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. 
+// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 (NOTE(review): the value in effect here is 0.5, not 1.0 - confirm which is intended) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial); + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s +
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.)
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines.
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +// NOTE(review): the two texel_off overloads below forward texel_off as the +// third argument of textureLod(), i.e. as a mip LOD rather than as a texel +// offset -- confirm this is the intended behavior of the port. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* +// NOTE(review): the marker above is a run of slashes followed by a star, so it +// does not terminate the block comment opened before the tex3D section; the +// comment appears to run on until the next star-slash terminator further down, +// which would also disable the definitions in between -- confirm this is +// intentional. + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian_integral_contrib() for detailed comments!
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+    if(beam_generalized_gaussian)
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+    }
+    else
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+        else
+        {
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    // Use max to avoid bizarre artifacts from negative colors:
+    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    // Requires: 1.) Requirements of include/gamma-management.h must be met:
+    //               intermediate_gamma must be globally defined, and input
+    //               colors are interpreted as linear RGB unless you #define
+    //               GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //               interpreted as gamma-encoded with intermediate_gamma).
+    //           2.) color0-3 are colors sampled from a texture with tex2D().
+    //               They are interpreted as defined in requirement 1.
+    //           3.) weights contains weights for each color, summing to 1.0.
+    //           4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //               float in [0.0, 1.0] describing how much blending should
+    //               be done in linear RGB (rest is gamma-corrected RGB).
+    //           5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //               if beam_horiz_linear_rgb_weight is anything other than a
+    //               static constant, or we may try branching at runtime
+    //               without dynamic branches allowed (slow).
+    // Returns: Return an interpolated color lookup between the four input
+    //          colors based on the weights in weights. The final color will
+    //          be a linear RGB value, but the blending will be done as
+    //          indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    // profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        // beam_horiz_linear_rgb_weight is static or dynamic branches are allowed, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif // GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            // Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            // Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(1.0/intermediate_gamma)),
+                pow(color1, float3(1.0/intermediate_gamma)),
+                pow(color2, float3(1.0/intermediate_gamma)),
+                pow(color3, float3(1.0/intermediate_gamma)),
+                weights);
+            // FIXME(review): disabled leftover debug constant; confirm it is unused and remove:
+//          const float beam_horiz_linear_rgb_weight1 = 1.0;
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif // GAMMA_ENCODE_EVERY_FBO
+    #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    // Requires: 1.) scanline_uv must be vertically snapped to the caller's
+    //               desired line or scanline and horizontally snapped to the
+    //               texel just left of the output pixel (color1)
+    //           2.) uv_step_x must contain the horizontal uv distance
+    //               between texels.
+    //           3.) weights must contain interpolation filter weights for
+    //               color0, color1, color2, and color3, where color1 is just
+    //               left of the output pixel.
+    // Returns: Return a horizontally interpolated texture lookup using 2-4
+    //          nearby texels, according to weights and the conventions of
+    //          get_interpolated_linear_color().
+    // We can ignore the outside texture lookups for Quilez resampling.
+    const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    float3 color0 = float3(0.0);
+    float3 color3 = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    // Sample the texture as-is, whether it's linear or gamma-encoded:
+    // get_interpolated_linear_color() will handle the difference.
+    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    // Horizontally filter tex at tex_uv; tex_size/texture_size_inv map uv<->texels.
+    // Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 curr_texel = tex_uv * tex_size;
+    // Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);
+    const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
+    const float prev_dist = curr_texel.x - prev_texel_hor.x;
+    const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
+        1.0 - prev_dist, 2.0 - prev_dist);
+    // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        // Quilez:
+        const float x = sample_dists.y;
+        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
+        weights = float4(0.0, 1.0 - w2, w2, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        // Gaussian:
+        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
+    }
+    else
+    {
+        // Lanczos2:
+        const float4 pi_dists = FIX_ZERO(sample_dists * pi);
+        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
+            (pi_dists * pi_dists);
+    }
+    // Ensure the weight sum == 1.0:
+    const float4 final_weights = weights/dot(weights, float4(1.0));
+    // Get the interpolated horizontal scanline color:
+    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, prev_texel_hor_uv, uv_step_x, final_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    // Same parameters as sample_single_scanline_horizontal(), sampled per color channel.
+    // Rely on a helper to make convergence easier.
+    if(beam_misconvergence)
+    {
+        const float3 convergence_offsets_rgb =
+            get_convergence_offsets_x_vector();
+        const float3 offset_u_rgb =
+            convergence_offsets_rgb * texture_size_inv.xxx;
+        const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0);
+        const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0);
+        const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0);
+        const float3 sample_r = sample_single_scanline_horizontal(
+            tex, scanline_uv_r, tex_size, texture_size_inv);
+        const float3 sample_g = sample_single_scanline_horizontal(
+            tex, scanline_uv_g, tex_size, texture_size_inv);
+        const float3 sample_b = sample_single_scanline_horizontal(
+            tex, scanline_uv_b, tex_size, texture_size_inv);
+        return float3(sample_r.r, sample_g.g, sample_b.b);
+    }
+    else
+    {
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    // Compute texture coords for the last/upper scanline, accounting for
+    // interlacing: With interlacing, only consider even/odd scanlines every
+    // other frame. Top-field first (TFF) order puts even scanlines on even
+    // frames, and BFF order puts them on odd frames. Texels are centered at:
+    //     frac(tex_uv * tex_size) == x.5
+    // Caution: If these coordinates ever seem incorrect, first make sure it's
+    // not because anisotropic filtering is blurring across field boundaries.
+    // Note: TFF/BFF won't matter for sources that double-weave or similar.
+    // FIXME(review): disabled leftover debug constant; confirm it is unused and remove:
+//  const float interlace_bff1 = 1.0;
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 curr_texel = tex_uv * tex_size;
+    // Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    // Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    // Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    // Detect interlacing based on the number of lines in the source.
+    if(interlace_detect)
+    {
+        // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        // NTSC Emulators: Typically 224 or 240 lines
+        // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        // PAL Emulators: ?
+        // ATSC: 720p, 1080i, 1080p
+        // Where do we place our cutoffs? Assumptions:
+        // 1.) We only need to care about active lines.
+        // 2.) Anything > 288 and <= 576 lines is probably interlaced.
+        // 3.) Anything > 576 lines is probably not interlaced...
+        // 4.) ...except 1080 lines, which is a crapshoot (user decision).
+        // 5.) Just in case the main program uses calculated video sizes,
+        //     we should nudge the float thresholds a bit.
+        const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5));
+        const bool hd_interlace = bool(interlace_1080i) ?
+            ((num_lines > 1079.5) && (num_lines < 1080.5)) :
+            false;
+        return (sd_interlace || hd_interlace);
+    }
+    else
+    {
+        return false;
+    }
+}
+
+#endif // SCANLINE_FUNCTIONS_H
+
+///////////////////////////// END SCANLINE-FUNCTIONS ////////////////////////////
+
+void main() {
+    gl_Position = position;
+    vTexCoord = texCoord;
+    uv_step = float2(1.0)/texture_size;
+
+    // Detect interlacing: 1.0 = true, 0.0 = false.
+    const float2 _video_size = video_size;
+    interlaced = float(is_interlaced(_video_size.y));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/geometry-aa-last-pass.fs b/shaders/CRT-Royale.shader/geometry-aa-last-pass.fs
new file mode 100644
index 00000000..87d1b721
--- /dev/null
+++ b/shaders/CRT-Royale.shader/geometry-aa-last-pass.fs
@@ -0,0 +1,5279 @@
+#version 150
+
+///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
+
+// crt-royale: A full-featured CRT shader, with cheese.
+// Copyright (C) 2014 TroggleMonkey
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307 USA
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];
+uniform vec4 targetSize;
+uniform int phase;
+
+in Vertex {
+    vec2 vTexCoord;
+    vec2 tex_uv;
+    vec4 video_and_texture_size_inv;
+    vec2 output_size_inv;
+    vec3 eye_pos_local;
+    vec4 geom_aspect_and_overscan;
+    vec3 global_to_local_row0;
+    vec3 global_to_local_row1;
+    vec3 global_to_local_row2;
+};
+
+out vec4 FragColor;
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 1.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms (mul() parenthesizes its operands so compound arguments keep their precedence)
+#define mul(a,b) ((b)*(a))
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static
+#define inline
+#define const
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(x,y)
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#if defined(GL_ES)
+    #define COMPAT_PRECISION mediump
+#else
+    #define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+    #define COMPAT_TEXTURE texture
+#else
+    #define COMPAT_TEXTURE texture2D
+#endif
+
+///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
+
+#define LAST_PASS
+#define SIMULATE_CRT_ON_LCD
+
+//#include "../user-settings.h"
+
+///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
+
+// The Cg compiler uses different "profiles" with different capabilities.
+// This shader requires a Cg compilation profile >= arbfp1, but a few options
+// require higher profiles like fp30 or fp40.
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; currently 0.5 (an earlier note here said it was raised to 1.0 because 0.5 looked unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (the porter found 0.5 unnecessarily dark and preferred 1.0, but 0.5 is the value set here) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors settings which still need "cooking:" +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +#ifndef RUNTIME_GEOMETRY_TILT + // Create a local-to-global rotation matrix for the CRT's coordinate frame + // and its global-to-local inverse. See the vertex shader for details. + // It's faster to compute these statically if possible. 
+ static const float2 sin_tilt = sin(geom_tilt_angle_static); + static const float2 cos_tilt = cos(geom_tilt_angle_static); + static const float3x3 geom_local_to_global_static = float3x3( + cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, + 0.0, cos_tilt.y, -sin_tilt.y, + -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); + static const float3x3 geom_global_to_local_static = float3x3( + cos_tilt.x, 0.0, -sin_tilt.x, + sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x, + cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x); +#endif + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.)
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines.
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: NOTE(review): the two texel_off overloads below forward texel_off as the third (LOD) argument of textureLod rather than applying a texel offset -- confirm this is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: NOTE(review): the first overload always samples LOD 0.0 and ignores tex_coords.zw; the second forwards texel_off as the LOD -- confirm both are intended. +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +*/ // End of the disabled tex3D wrappers. The previous marker here never terminated the block comment, which also disabled the tex2D_linearize_gamma wrappers further down. + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "tex2Dantialias.h" + +///////////////////////// BEGIN TEX2DANTIALIAS ///////////////////////// + +#ifndef TEX2DANTIALIAS_H +#define TEX2DANTIALIAS_H + +///////////////////////////// GPL LICENSE 
NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides antialiased and subpixel-aware tex2D lookups. +// Requires: All functions share these requirements: +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixel- +// space offsets to texture uv offsets. You can get this with: +// const float2 duv_dx = ddx(tex_uv); +// const float2 duv_dy = ddy(tex_uv); +// const float2x2 pixel_to_tex_uv = float2x2( +// duv_dx.x, duv_dy.x, +// duv_dx.y, duv_dy.y); +// This is left to the user in case the current Cg profile +// doesn't support ddx()/ddy(). Ideally, the user could +// calculate a distorted tangent-space mapping analytically. +// If not, a simple flat mapping can be obtained with: +// const float2 xy_to_uv_scale = output_size * +// video_size/texture_size; +// const float2x2 pixel_to_tex_uv = float2x2( +// xy_to_uv_scale.x, 0.0, +// 0.0, xy_to_uv_scale.y); +// Optional: To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and: +// 1.) 
Set an antialiasing level: +// static const float aa_level = {0 (none), +// 1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24} +// 2.) Set a filter type: +// static const float aa_filter = { +// 0 (Box, Separable), 1 (Box, Cylindrical), +// 2 (Tent, Separable), 3 (Tent, Cylindrical) +// 4 (Gaussian, Separable), 5 (Gaussian, Cylindrical) +// 6 (Cubic, Separable), 7 (Cubic, Cylindrical) +// 8 (Lanczos Sinc, Separable), +// 9 (Lanczos Jinc, Cylindrical)} +// If the input is unknown, a separable box filter is used. +// Note: Lanczos Jinc is terrible for sparse sampling, and +// using aa_xy_axis_importance (see below) defeats the purpose. +// 3.) Mirror the sample pattern on odd frames? +// static const bool aa_temporal = {true, false} +// This helps rotational invariance but can look "fluttery." +// The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override +// (all of) the following default parameters with static or uniform +// constants (or an accessor function for subpixel offsets): +// 1.) Cubic parameters: +// static const float aa_cubic_c = 0.5; +// See http://www.imagemagick.org/Usage/filter/#mitchell +// 2.) Gaussian parameters: +// static const float aa_gauss_sigma = +// 0.5/aa_pixel_diameter; +// 3.) Set subpixel offsets. This requires an accessor function +// for compatibility with scalar runtime shader parameters. Return +// a float2 pixel offset in [-0.5, 0.5] for the red subpixel: +// float2 get_aa_subpixel_r_offset() +// The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to +// override (all of) the following default static values. However, +// the file's structure requires them to be declared static const: +// 1.) static const float aa_lanczos_lobes = 3.0; +// 2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter; +// Note the default tent/Gaussian support radii may appear +// arbitrary, but extensive testing found them nearly optimal +// for tough cases like strong distortion at low AA levels. 
+// (The Gaussian default is only best for practical gauss_sigma +// values; much larger gauss_sigmas ironically prefer slightly +// smaller support given sparse sampling, and vice versa.) +// 3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter; +// 4.) static const float2 aa_xy_axis_importance: +// The sparse N-queens sampling grid interacts poorly with +// negative-lobed 2D filters. However, if aliasing is much +// stronger in one direction (e.g. horizontally with a phosphor +// mask), it can be useful to downplay sample offsets along the +// other axis. The support radius in each direction scales with +// aa_xy_axis_importance down to a minimum of 0.5 (box support), +// after which point only the offsets used for calculating +// weights continue to scale downward. This works as follows: +// If aa_xy_axis_importance = float2(1.0, 1.0/support_radius), +// the vertical support radius will drop to 1.0, and we'll just +// filter vertical offsets with the first filter lobe, while +// horizontal offsets go through the full multi-lobe filter. +// If aa_xy_axis_importance = float2(1.0, 0.0), the vertical +// support radius will drop to box support, and the vertical +// offsets will be ignored entirely (essentially giving us a +// box filter vertically). The former is potentially smoother +// (but less predictable) and the default behavior of Lanczos +// jinc, whereas the latter is sharper and the default behavior +// of cubics and Lanczos sinc. +// 5.) static const float aa_pixel_diameter: You can expand the +// pixel diameter to e.g. sqrt(2.0), which may be a better +// support range for cylindrical filters (they don't +// currently discard out-of-circle samples though). +// Finally, there are two miscellaneous options: +// 1.) If you want to antialias a manually tiled texture, you can +// #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to +// fix incompatibilities with anisotropic filtering. 
This is +// slower, and the Cg profile must support tex2Dlod(). +// 2.) If aa_cubic_c is a runtime uniform, you can #define +// RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per +// fragment instead of at the usage site (which is used by +// default, because it enables static evaluation). +// Description: +// Each antialiased lookup follows these steps: +// 1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5] +// pixels, spanning the diameter of a rectangular box filter. +// 2.) Scale these offsets by the support diameter of the user's chosen filter. +// 3.) Using these pixel offsets from the pixel center, compute the offsets to +// predefined subpixel locations. +// 4.) Compute filter weights based on subpixel offsets. +// Much of that can often be done at compile-time. At runtime: +// 1.) Project pixel-space offsets into uv-space with a matrix multiplication +// to get the uv offsets for each sample. Rectangular pixels have a +// diameter of 1.0. Circular pixels are not currently supported, but they +// might be better with a diameter of sqrt(2.0) to ensure there are no gaps +// between them. +// 2.) Load, weight, and sum samples. +// We use a sparse bilinear sampling grid, so there are two major implications: +// 1.) We can directly project the pixel-space support box into uv-space even +// if we're upsizing. This wouldn't be the case for nearest neighbor, +// where we'd have to expand the uv-space diameter to at least the support +// size to ensure sufficient filter support. In our case, this allows us +// to treat upsizing the same as downsizing and use static weighting. :) +// 2.) For decent results, negative-lobed filters must be computed based on +// separable weights, not radial distances, because the sparse sampling +// makes no guarantees about radial distributions. Even then, it's much +// better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g. +// Lanczos2 horizontally and a box filter vertically. 
This is mainly due +// to the sparse N-queens sampling and a statistically enormous positive or +// negative covariance between horizontal and vertical weights. +// +// Design Decision Comments: +// "aa_temporal" mirrors the sample pattern on odd frames along the axis that +// keeps subpixel weights constant. This helps with rotational invariance, but +// it can cause distracting fluctuations, and horizontal and vertical edges +// will look the same. Using a different pattern on a shifted grid would +// exploit temporal AA better, but it would require a dynamic branch or a lot +// of conditional moves, so it's prohibitively slow for the minor benefit. + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#ifndef ANTIALIAS_OVERRIDE_BASICS + // The following settings must be static constants: + static const float aa_level = 12.0; + static const float aa_filter = 0.0; + static const bool aa_temporal = false; +#endif + +#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS + // Users may override these parameters, but the file structure requires + // them to be static constants; see the descriptions above. + static const float aa_pixel_diameter = 1.0; + static const float aa_lanczos_lobes = 3.0; + static const float aa_gauss_support = 1.0 / aa_pixel_diameter; + static const float aa_tent_support = 1.0 / aa_pixel_diameter; + + // If we're using a negative-lobed filter, default to using it horizontally + // only, and use only the first lobe vertically or a box filter, over a + // correspondingly smaller range. This compensates for the sparse sampling + // grid's typically large positive/negative x/y covariance. + static const float2 aa_xy_axis_importance = + aa_filter < 5.5 ? float2(1.0) : // Box, tent, Gaussian + aa_filter < 8.5 ? float2(1.0, 0.0) : // Cubic and Lanczos sinc + aa_filter < 9.5 ? 
float2(1.0, 1.0/aa_lanczos_lobes) : // Lanczos jinc + float2(1.0); // Default to box +#endif + +#ifndef ANTIALIAS_OVERRIDE_PARAMETERS + // Users may override these values with their own uniform or static consts. + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c = 0.5; + static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter; + // Users may override the subpixel offset accessor function with their own. + // A function is used for compatibility with scalar runtime shader parameters. + inline float2 get_aa_subpixel_r_offset() + { + return float2(0.0, 0.0); + } +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../../../../include/gamma-management.h" + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +static const float aa_box_support = 0.5; +static const float aa_cubic_support = 2.0; + + +//////////////////////////// GLOBAL NON-CONSTANTS //////////////////////////// + +// We'll want to define these only once per fragment at most. +#ifdef RUNTIME_ANTIALIAS_WEIGHTS + float aa_cubic_b; + float cubic_branch1_x3_coeff; + float cubic_branch1_x2_coeff; + float cubic_branch1_x0_coeff; + float cubic_branch2_x3_coeff; + float cubic_branch2_x2_coeff; + float cubic_branch2_x1_coeff; + float cubic_branch2_x0_coeff; +#endif + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +void assign_aa_cubic_constants() +{ + // Compute cubic coefficients on demand at runtime, and save them to global + // uniforms. The B parameter is computed from C, because "Keys cubics" + // with B = 1 - 2C are considered the highest quality. 
+ #ifdef RUNTIME_ANTIALIAS_WEIGHTS + if(aa_filter > 5.5 && aa_filter < 7.5) + { + aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; + cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; + cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; + } + #endif +} + +inline float4 get_subpixel_support_diam_and_final_axis_importance() +{ + // Statically select the base support radius: + static const float base_support_radius = + aa_filter < 1.5 ? aa_box_support : + aa_filter < 3.5 ? aa_tent_support : + aa_filter < 5.5 ? aa_gauss_support : + aa_filter < 7.5 ? aa_cubic_support : + aa_filter < 9.5 ? aa_lanczos_lobes : + aa_box_support; // Default to box + // Expand the filter support for subpixel filtering. + const float2 subpixel_support_radius_raw = + float2(base_support_radius) + abs(get_aa_subpixel_r_offset()); + if(aa_filter < 1.5) + { + // Ignore aa_xy_axis_importance for box filtering. + const float2 subpixel_support_diam = + 2.0 * subpixel_support_radius_raw; + const float2 final_axis_importance = float2(1.0); + return float4(subpixel_support_diam, final_axis_importance); + } + else + { + // Scale the support window by aa_xy_axis_importance, but don't narrow + // it further than box support. This allows decent vertical AA without + // messing up horizontal weights or using something silly like Lanczos4 + // horizontally with a huge vertical average over an 8-pixel radius. 
+ const float2 subpixel_support_radius = max(float2(aa_box_support, aa_box_support), + subpixel_support_radius_raw * aa_xy_axis_importance); + // Adjust aa_xy_axis_importance to compensate for what's already done: + const float2 final_axis_importance = aa_xy_axis_importance * + subpixel_support_radius_raw/subpixel_support_radius; + const float2 subpixel_support_diam = 2.0 * subpixel_support_radius; + return float4(subpixel_support_diam, final_axis_importance); + } +} + + +/////////////////////////// FILTER WEIGHT FUNCTIONS ////////////////////////// + +inline float eval_box_filter(const float dist) +{ + return float(abs(dist) <= aa_box_support); +} + +inline float eval_separable_box_filter(const float2 offset) +{ + return float(all(bool2((abs(offset.x) <= aa_box_support), (abs(offset.y) <= aa_box_support)))); +} + +inline float eval_tent_filter(const float dist) +{ // Uses abs(dist): the separable tent path passes signed per-axis offsets, and a negative value would otherwise clamp to full weight. + return clamp((aa_tent_support - abs(dist))/ + aa_tent_support, 0.0, 1.0); +} + +inline float eval_gaussian_filter(const float dist) +{ + return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma)); +} + +inline float eval_cubic_filter(const float dist) +{ + // Compute coefficients like assign_aa_cubic_constants(), but statically. + #ifndef RUNTIME_ANTIALIAS_WEIGHTS + // When runtime weights are used, these values are instead written to + // global uniforms at the beginning of each tex2Daa* call. 
+ const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; + const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; + const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; + #endif + const float abs_dist = abs(dist); + // Compute the cubic based on the Horner's method formula in: + // http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf + return (abs_dist < 1.0 ? + (cubic_branch1_x3_coeff*abs_dist + + cubic_branch1_x2_coeff)*abs_dist*abs_dist + + cubic_branch1_x0_coeff : + abs_dist < 2.0 ? + ((cubic_branch2_x3_coeff*abs_dist + + cubic_branch2_x2_coeff)*abs_dist + + cubic_branch2_x1_coeff)*abs_dist + cubic_branch2_x0_coeff : + 0.0)/6.0; +} + +inline float eval_separable_cubic_filter(const float2 offset) +{ + // This is faster than using a specific float2 version: + return eval_cubic_filter(offset.x) * + eval_cubic_filter(offset.y); +} + +inline float2 eval_sinc_filter(const float2 offset) +{ + // It's faster to let the caller handle the zero case, or at least it + // was when I used macros and the shader preset took a full minute to load. + const float2 pi_offset = pi * offset; + return sin(pi_offset)/pi_offset; +} + +inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe) +{ + // Note: For sparse sampling, you really need to pick an axis to use + // Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)). 
+ const float2 offset = FIX_ZERO(offset_unsafe); + const float2 xy_weights = eval_sinc_filter(offset) * + eval_sinc_filter(offset/aa_lanczos_lobes); + return xy_weights.x * xy_weights.y; +} + +inline float eval_jinc_filter_unorm(const float x) +{ + // This is a Jinc approximation for x in [0, 45). We'll use x in range + // [0, 4*pi) or so. There are faster/closer approximations based on + // piecewise cubics from [0, 45) and asymptotic approximations beyond that, + // but this has a maximum absolute error < 1/512, and it's simpler/faster + // for shaders...not that it's all that useful for sparse sampling anyway. + const float point3845_x = 0.38448566093564*x; + const float exp_term = exp(-(point3845_x*point3845_x)); + const float point8154_plus_x = 0.815362332840791 + x; + const float cos_term = cos(point8154_plus_x); + return ( + 0.0264727330997042*min(x, 6.83134964622778) + + 0.680823557250528*exp_term + + -0.0597255978950933*min(7.41043194481873, x)*cos_term / + (point8154_plus_x + 0.0646074538634482*(x*x) + + cos(x)*max(exp_term, cos(x) + cos_term)) - + 0.180837503591406); +} + +inline float eval_jinc_filter(const float dist) +{ + return eval_jinc_filter_unorm(pi * dist); +} + +inline float eval_lanczos_jinc_filter(const float dist) +{ + return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes); +} + + +inline float3 eval_unorm_rgb_weights(const float2 offset, + const float2 final_axis_importance) +{ + // Requires: 1.) final_axis_importance must be computed according to + // get_subpixel_support_diam_and_final_axis_importance(). + // 2.) aa_filter must be a global constant. + // 3.) offset must be an xy pixel offset in the range: + // ([-subpixel_support_diameter.x/2, + // subpixel_support_diameter.x/2], + // [-subpixel_support_diameter.y/2, + // subpixel_support_diameter.y/2]) + // Returns: Sample weights at R/G/B destination subpixels for the + // given xy pixel offset. 
+ const float2 offset_g = offset * final_axis_importance; + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 offset_r = offset_g - aa_r_offset * final_axis_importance; + const float2 offset_b = offset_g + aa_r_offset * final_axis_importance; + // Statically select a filter: + if(aa_filter < 0.5) + { + return float3(eval_separable_box_filter(offset_r), + eval_separable_box_filter(offset_g), + eval_separable_box_filter(offset_b)); + } + else if(aa_filter < 1.5) + { + return float3(eval_box_filter(length(offset_r)), + eval_box_filter(length(offset_g)), + eval_box_filter(length(offset_b))); + } + else if(aa_filter < 2.5) + { + return float3( + eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y), + eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y), + eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y)); + } + else if(aa_filter < 3.5) + { + return float3(eval_tent_filter(length(offset_r)), + eval_tent_filter(length(offset_g)), + eval_tent_filter(length(offset_b))); + } + else if(aa_filter < 4.5) + { + return float3( + eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y), + eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y), + eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y)); + } + else if(aa_filter < 5.5) + { + return float3(eval_gaussian_filter(length(offset_r)), + eval_gaussian_filter(length(offset_g)), + eval_gaussian_filter(length(offset_b))); + } + else if(aa_filter < 6.5) + { + return float3( + eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y), + eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y), + eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y)); + } + else if(aa_filter < 7.5) + { + return float3(eval_cubic_filter(length(offset_r)), + eval_cubic_filter(length(offset_g)), + eval_cubic_filter(length(offset_b))); + } + else if(aa_filter < 8.5) + { + return float3(eval_separable_lanczos_sinc_filter(offset_r), + 
eval_separable_lanczos_sinc_filter(offset_g), + eval_separable_lanczos_sinc_filter(offset_b)); + } + else if(aa_filter < 9.5) + { + return float3(eval_lanczos_jinc_filter(length(offset_r)), + eval_lanczos_jinc_filter(length(offset_g)), + eval_lanczos_jinc_filter(length(offset_b))); + } + else + { + // Default to a box, because Lanczos Jinc is so bad. ;) + return float3(eval_separable_box_filter(offset_r), + eval_separable_box_filter(offset_g), + eval_separable_box_filter(offset_b)); + } +} + + +////////////////////////////// HELPER FUNCTIONS ////////////////////////////// + +inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s) +{ + // If we're manually tiling a texture, anisotropic filtering can get + // confused. This is one workaround: + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + // TODO: Use tex2Dlod_linearize with a calculated mip level. + return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0)); + #else + return tex2D_linearize(samp, s); + #endif +} + +inline float2 get_frame_sign(const float frame) +{ + if(aa_temporal) + { + // Mirror the sampling pattern for odd frames in a direction that + // lets us keep the same subpixel sample weights: + const float frame_odd = float(fmod(frame, 2.0) > 0.5); + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0))); + return mirror; // NOTE(review): frame_odd is computed but never used, so the result is the same on even and odd frames; each component is -1.0 or -0.0, never +1.0 -- confirm intent. + } + else + { + return float2(1.0, 1.0); + } +} + + +///////////////////////// ANTIALIASED TEXTURE LOOKUPS //////////////////////// + +float3 tex2Daa_subpixel_weights_only(const sampler2D tex, + const float2 tex_uv, const float2x2 pixel_to_tex_uv) +{ + // This function is unlike the others: Just perform a single independent + // lookup for each subpixel. It may be very aliased. 
+ const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset); + const float color_g = tex2D_linearize(tex, tex_uv).g; + const float color_r = tex2D_linearize(tex, tex_uv + aa_r_offset_uv_offset).r; + const float color_b = tex2D_linearize(tex, tex_uv - aa_r_offset_uv_offset).b; + return float3(color_r, color_g, color_b); +} + +// The tex2Daa* functions compile very slowly due to all the macros and +// compile-time math, so only include the ones we'll actually use! +float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use an RGMS4 pattern (4-queens): + // . . Q . : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4 + // Q . . . : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4 + // . . . Q : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4 + // . Q . . : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4 + // Static screenspace sample offsets (compute some implicitly): + static const float grid_size = 4.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = w1.bgr; + const float3 w3 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. + const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + + w2 * sample2 + w3 * sample3); +} + +float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 5-queens pattern: + // . Q . . . : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5 + // . . . . Q : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5 + // . . Q . . : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5 + // Q . . . . : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5 + // . . . Q . 
: off =(-2.0, -2.0)/5 + (3.0, 4.0)/5 + // Static screenspace sample offsets (compute some implicitly): + static const float grid_size = 5.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = w1.bgr; + const float3 w4 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + + w2 * sample2 + w3 * sample3 + w4 * sample4); +} + +float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 6-queens pattern with a stronger horizontal + // than vertical slant: + // . . . . Q . : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6 + // . . Q . . . : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6 + // Q . . . . . : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6 + // . . . . . Q : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6 + // . . . Q . . : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6 + // . Q . . . . 
: off =(-2.5, -2.5)/6 + (1.0, 5.0)/6 + // Static screenspace sample offsets (compute some implicitly): + static const float grid_size = 6.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = w2.bgr; + const float3 w4 = w1.bgr; + const float3 w5 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 + + w3 * sample3 + w4 * sample4 + w5 * sample5); +} + +float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 7-queens pattern with a queen in the center: + // . Q . . . . . : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7 + // . . . . Q . . : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7 + // Q . . . . . . : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7 + // . . . Q . . . : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7 + // . . . . . . Q : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7 + // . . Q . . . . : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7 + // . . . . . Q . 
: off =(-3.0, -3.0)/7 + (5.0, 6.0)/7 + static const float grid_size = 7.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = w2.bgr; + const float3 w5 = w1.bgr; + const float3 w6 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2; + const float3 w_sum = half_sum + half_sum.bgr + w3; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6); +} + +float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 8-queens pattern. + // . . Q . . . . . : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8 + // . . . . Q . . . : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8 + // . Q . . . . . . : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8 + // . . . . . . . Q : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8 + // Q . . . . . . . : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8 + // . . . . . . Q . : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8 + // . . . Q . . . . : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8 + // . . . . . Q . . 
: off =(-3.5, -3.5)/8 + (5.0, 7.0)/8 + static const float grid_size = 8.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = w3.bgr; + const float3 w5 = w2.bgr; + const float3 w6 = w1.bgr; + const float3 w7 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, and mirror on odd frames if directed: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7); +} + +float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 12-superqueens pattern where no 3 points are + // exactly collinear. + // . . . Q . . . . . . . . : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12 + // . . . . . . . . . Q . . : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12 + // . . . . . . Q . . . . . : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12 + // . Q . . . . . . . . . . : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12 + // . . . . . . . . . . . 
Q : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12 + // . . . . Q . . . . . . . : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12 + // . . . . . . . Q . . . . : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12 + // Q . . . . . . . . . . . : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12 + // . . . . . . . . . . Q . : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12 + // . . . . . Q . . . . . . : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12 + // . . Q . . . . . . . . . : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12 + // . . . . . . . . Q . . . : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12 + static const float grid_size = 12.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(3.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(6.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(11.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(4.0, 5.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = w5.bgr; + const float3 w7 = w4.bgr; + const float3 w8 = w3.bgr; + const float3 w9 = w2.bgr; + const float3 w10 = w1.bgr; + const float3 w11 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/w_sum; + // Scale the pixel-space to texture offset matrix by the pixel diameter. + const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv 
+ uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11); +} + +float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 16-superqueens pattern where no 3 points are + // exactly collinear. + // . . Q . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16 + // . . . . . . . . . Q . . . . . . : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16 + // . . . . . . . . . . . . Q . . . : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16 + // . . . . Q . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16 + // . . . . . . . . Q . . . . . . . : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16 + // . . . . . . . . . . . . . . Q . : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16 + // Q . . . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16 + // . . . . . . . . . . Q . . . . . : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16 + // . . . . . Q . . . . . . . . . . : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16 + // . . . . . . . . . . . . . . . Q : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16 + // . Q . . . . . . . . . . . . . . 
: off =(-7.5, -7.5)/16 + (1.0, 10.0)/16 + // . . . . . . . Q . . . . . . . . : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16 + // . . . . . . . . . . . Q . . . . : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16 + // . . . Q . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16 + // . . . . . . Q . . . . . . . . . : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16 + // . . . . . . . . . . . . . Q . . : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16 + static const float grid_size = 16.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(12.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(4.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(8.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(14.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(0.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(10.0, 7.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = w7.bgr; + const float3 w9 = w6.bgr; + const float3 w10 = w5.bgr; + const float3 w11 = w4.bgr; + const float3 w12 = w3.bgr; + const float3 w13 = w2.bgr; + const float3 w14 = w1.bgr; + const float3 w15 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample13 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); +} + +float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 20-superqueens pattern where no 3 points are + // exactly collinear and superqueens have a squared attack radius of 13. + // . . . . . . . Q . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20 + // . . . . . . . . . . . . . . . . Q . . . : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20 + // . . . . . . . . . . . Q . . . . . . . . : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20 + // . Q . . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20 + // . . . . . Q . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20 + // . . . . . . . . . . . . . . . Q . . . . : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20 + // . . . . . . . . . . Q . . . . . . . . . : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20 + // . . . . . . . . . . . . . . . . . . . Q : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20 + // . . Q . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20 + // . . . . . . Q . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20 + // . . . . . . . . . . . . . Q . . . . . . : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20 + // . . . . . . . . . . . . . . . . . Q . . : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20 + // Q . . . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20 + // . . . . . . . . . Q . . . . . . . . . . 
: off =(-9.5, -9.5)/20 + (9.0, 13.0)/20 + // . . . . Q . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20 + // . . . . . . . . . . . . . . Q . . . . . : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20 + // . . . . . . . . . . . . . . . . . . Q . : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20 + // . . . . . . . . Q . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20 + // . . . Q . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20 + // . . . . . . . . . . . . Q . . . . . . . : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20 + static const float grid_size = 20.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(7.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(11.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(10.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(19.0, 7.0) * xy_step; + const float2 xy_offset8 = xy_start_offset + float2(2.0, 8.0) * xy_step; + const float2 xy_offset9 = xy_start_offset + float2(6.0, 9.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const float3 w10 = w9.bgr; + const float3 w11 = w8.bgr; + const float3 w12 = w7.bgr; + const float3 w13 = w6.bgr; + const float3 w14 = w5.bgr; + const float3 w15 = w4.bgr; + const float3 w16 = w3.bgr; + const float3 w17 = w2.bgr; + const float3 w18 = w1.bgr; + const float3 w19 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; + const float3 sample11 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 + + w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19); +} + +float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 24-superqueens pattern where no 3 points are + // exactly collinear and superqueens have a squared attack radius of 13. + // . . . . . . Q . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24 + // . . . . . . . . . . . . . . . . Q . . . . . . . : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24 + // . . . . . . . . . . Q . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24 + // . . . . . . . . . . . . . . . . . . . . . Q . . : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24 + // . . . . . Q . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24 + // . . . . . . . . . . . . . . . Q . . . . . . . . : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24 + // . Q . . . . . . . . . . 
. . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24 + // . . . . . . . . . . . Q . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24 + // . . . . . . . . . . . . . . . . . . . Q . . . . : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24 + // . . . . . . . . . . . . . . . . . . . . . . . Q : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24 + // . . . Q . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24 + // . . . . . . . . . . . . . . Q . . . . . . . . . : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24 + // . . . . . . . . . Q . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24 + // . . . . . . . . . . . . . . . . . . . . Q . . . : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24 + // Q . . . . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24 + // . . . . Q . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24 + // . . . . . . . . . . . . Q . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24 + // . . . . . . . . . . . . . . . . . . . . . . Q . : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24 + // . . . . . . . . Q . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24 + // . . . . . . . . . . . . . . . . . . Q . . . . . : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24 + // . . Q . . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24 + // . . . . . . . . . . . . . Q . . . . . . . . . . : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24 + // . . . . . . . Q . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24 + // . . . . . . . . . . . . . . . . . Q . . . . . . 
: off =(-11.5, -11.5)/24 + (17.0, 23.0)/24 + static const float grid_size = 24.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(6.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(10.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(21.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(1.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(11.0, 7.0) * xy_step; + const float2 xy_offset8 = xy_start_offset + float2(19.0, 8.0) * xy_step; + const float2 xy_offset9 = xy_start_offset + float2(23.0, 9.0) * xy_step; + const float2 xy_offset10 = xy_start_offset + float2(3.0, 10.0) * xy_step; + const float2 xy_offset11 = xy_start_offset + float2(14.0, 11.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const float3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance); + const float3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance); + const float3 w12 = w11.bgr; + const float3 w13 = w10.bgr; + const float3 w14 = w9.bgr; + const float3 w15 = w8.bgr; + const float3 w16 = w7.bgr; + const float3 w17 = w6.bgr; + const float3 w18 = w5.bgr; + const float3 w19 = w4.bgr; + const float3 w20 = w3.bgr; + const float3 w21 = w2.bgr; + const float3 w22 = w1.bgr; + const float3 w23 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + + w5 + w6 + w7 + w8 + w9 + w10 + w11; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + const float2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign); + const float2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; + const float3 sample9 = 
tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; + const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 + + w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19 + + w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23); +} + +float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Sample on a regular 4x4 grid. This is mainly for testing. 
+ static const float grid_size = 4.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample: + const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + // (We can't exploit vertical or horizontal symmetry due to uncertain + // subpixel offsets. We could fix that by rotating xy offsets with the + // subpixel structure, but...no.) 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = w7.bgr; + const float3 w9 = w6.bgr; + const float3 w10 = w5.bgr; + const float3 w11 = w4.bgr; + const float3 w12 = w3.bgr; + const float3 w13 = w2.bgr; + const float3 w14 = w1.bgr; + const float3 w15 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, taking advantage of row alignment: + const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0)); + const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y)); + const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y); + const float2 sample0_uv = tex_uv + uv_offset0; + const float2 sample4_uv = sample0_uv + uv_step_y; + const float2 sample8_uv = sample0_uv + uv_step_y * 2.0; + const float2 sample12_uv = sample0_uv + uv_step_y * 3.0; + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb; + // Sum weighted 
samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); +} + +float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // This function is for testing only: Use an NxN grid with dynamic weights. + static const int grid_size = 8; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0; + const float2 filter_space_offset_step = + subpixel_support_diameter/float2(grid_size); + const float2 sample0_filter_space_offset = + -grid_radius_in_samples * filter_space_offset_step; + // Compute xy sample offsets and subpixel weights: + float3 weights[64]; //originally grid_size * grid_size + float3 weight_sum = float3(0.0, 0.0, 0.0); + for(int i = 0; i < grid_size; ++i) + { + for(int j = 0; j < grid_size; ++j) + { + // Weights based on xy distances: + const float2 offset = sample0_filter_space_offset + + float2(j, i) * filter_space_offset_step; + const float3 weight = eval_unorm_rgb_weights(offset, final_axis_importance); + weights[i*grid_size + j] = weight; + weight_sum += weight; + } + } + // Get uv offset vectors along x and y directions: + const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + const float2 uv_offset_step_x = mul(true_pixel_to_tex_uv, + float2(filter_space_offset_step.x, 0.0)); + const float2 uv_offset_step_y = mul(true_pixel_to_tex_uv, + float2(0.0, filter_space_offset_step.y)); + // Get a starting sample location: + const float2 sample0_uv_offset = 
-grid_radius_in_samples * + (uv_offset_step_x + uv_offset_step_y); + const float2 sample0_uv = tex_uv + sample0_uv_offset; + // Load, weight, and sum [linearized] samples: + float3 sum = float3(0.0, 0.0, 0.0); + const float3 weight_sum_inv = float3(1.0)/weight_sum; + for(int i = 0; i < grid_size; ++i) + { + const float2 row_i_first_sample_uv = + sample0_uv + i * uv_offset_step_y; + for(int j = 0; j < grid_size; ++j) + { + const float2 sample_uv = + row_i_first_sample_uv + j * uv_offset_step_x; + sum += weights[i*grid_size + j] * + tex2Daa_tiled_linearize(tex, sample_uv).rgb; + } + } + return sum * weight_sum_inv; +} + + +/////////////////////// ANTIALIASING CODEPATH SELECTION ////////////////////// + +inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ +#ifdef DEBUG + return tex2Daa_subpixel_weights_only( + tex, tex_uv, pixel_to_tex_uv); +#else + // Statically switch between antialiasing modes/levels: + return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb : + (aa_level < 3.5) ? tex2Daa_subpixel_weights_only( + tex, tex_uv, pixel_to_tex_uv) : + (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 254.5) ? 
tex2Daa_debug_16x_regular( + tex, tex_uv, pixel_to_tex_uv, frame) : + tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame); +#endif +} + + +#endif // TEX2DANTIALIAS_H + +///////////////////////// END TEX2DANTIALIAS ///////////////////////// + +//#include "geometry-functions.h" + +///////////////////////// BEGIN GEOMETRY-FUNCTIONS ///////////////////////// + +#ifndef GEOMETRY_FUNCTIONS_H +#define GEOMETRY_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// already included elsewhere +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" +//#include "bind-shader-h" + + +//////////////////////////// MACROS AND CONSTANTS //////////////////////////// + +// Curvature-related constants: +#define MAX_POINT_CLOUD_SIZE 9 + + +///////////////////////////// CURVATURE FUNCTIONS ///////////////////////////// + +float2 quadratic_solve(const float a, const float b_over_2, const float c) +{ + // Requires: 1.) a, b, and c are quadratic formula coefficients + // 2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out) + // 3.) 
b_over_2 must be guaranteed < 0.0 (avoids a branch) + // Returns: Returns float2(first_solution, discriminant), so the caller + // can choose how to handle the "no intersection" case. The + // Kahan or Citardauq formula is used for numerical robustness. + // Note: solution0 is computed through sqrt(discriminant) whatever the + // discriminant's sign, so it is only meaningful when the + // returned discriminant is non-negative. + const float discriminant = b_over_2*b_over_2 - a*c; + const float solution0 = c/(-b_over_2 + sqrt(discriminant)); + return float2(solution0, discriminant); +} + +float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec) +{ + // Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the sphere's + // local coordinate frame (eye_pos_vec is a position, i.e. + // a vector from the origin to the eye/camera) + // 2.) geom_radius is a global containing the sphere's radius + // Returns: Cast a ray of direction view_vec from eye_pos_vec at a + // sphere of radius geom_radius, and return the distance to + // the first intersection in units of length(view_vec). + // The value returned is quadratic_solve()'s + // float2(distance, discriminant). + // http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection + // Quadratic formula coefficients (b_over_2 is guaranteed negative): + const float a = dot(view_vec, view_vec); + const float b_over_2 = dot(view_vec, eye_pos_vec); // * 2.0 factored out + const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius; + return quadratic_solve(a, b_over_2, c); +} + +float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec) +{ + // Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the cylinder's + // local coordinate frame (eye_pos_vec is a position, i.e. + // a vector from the origin to the eye/camera) + // 2.) geom_radius is a global containing the cylinder's radius + // Returns: Cast a ray of direction view_vec from eye_pos_vec at a + // cylinder of radius geom_radius, and return the distance to + // the first intersection in units of length(view_vec). The + // derivation of the coefficients is in Christer Ericson's + // Real-Time Collision Detection, p. 
195-196, and this version + // uses Lagrange's identity to reduce operations. + // The value returned is quadratic_solve()'s + // float2(distance, discriminant). + // Arbitrary "cylinder top" reference point for an infinite cylinder: + const float3 cylinder_top_vec = float3(0.0, geom_radius, 0.0); + const float3 cylinder_axis_vec = float3(0.0, 1.0, 0.0);//float3(0.0, 2.0*geom_radius, 0.0); + const float3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec; + const float3 axis_x_view = cross(cylinder_axis_vec, view_vec); + const float3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec); + // Quadratic formula coefficients (b_over_2 is guaranteed negative): + const float a = dot(axis_x_view, axis_x_view); + const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view); + const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) - + geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec); + return quadratic_solve(a, b_over_2, c); +} + +float2 cylinder_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) +{ + // Requires: An xyz intersection position on a cylinder. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: Define square_uv.x to be the signed arc length in xz-space, + // and define square_uv.y = -intersection_pos_local.y (+v = -y). + // Start with a numerically robust arc length calculation. + const float angle_from_image_center = atan2(intersection_pos_local.x, + intersection_pos_local.z); + const float signed_arc_len = angle_from_image_center * geom_radius; + // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide + // by the aspect ratio to stretch the mapping appropriately: + const float2 square_uv = float2(signed_arc_len, -intersection_pos_local.y); + const float2 video_uv = square_uv / geom_aspect; + return video_uv; +} + +float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a cylinder. 
This is the + // inverse of cylinder_xyz_to_uv(). + // Expand video_uv by the aspect ratio to get proportionate x/y lengths, + // then calculate an xyz position for the cylindrical mapping above. + const float2 square_uv = video_uv * geom_aspect; + const float arc_len = square_uv.x; + const float angle_from_image_center = arc_len / geom_radius; + const float x_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = cos(angle_from_image_center) * geom_radius; + // Or: z = sqrt(geom_radius**2 - x**2) + // Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle) + const float3 intersection_pos_local = float3(x_pos, -square_uv.y, z_pos); + return intersection_pos_local; +} + +float2 sphere_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) +{ + // Requires: An xyz intersection position on a sphere. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: First define square_uv.x/square_uv.y == + // intersection_pos_local.x/intersection_pos_local.y. Then, + // length(square_uv) is the arc length from the image center + // at (0.0, 0.0, geom_radius) along the tangent great circle. + // Credit for this mapping goes to cgwg: I never managed to + // understand his code, but he told me his mapping was based on + // great circle distances when I asked him about it, which + // informed this very similar (almost identical) mapping. 
+ // Start with a numerically robust arc length calculation between the ray- + // sphere intersection point and the image center using a method posted by + // Roger Stafford on comp.soft-sys.matlab: + // https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ + const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius); + const float cp_len = + length(cross(intersection_pos_local, image_center_pos_local)); + const float dp = dot(intersection_pos_local, image_center_pos_local); + const float angle_from_image_center = atan2(cp_len, dp); + const float arc_len = angle_from_image_center * geom_radius; + // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide + // by the aspect ratio to stretch the mapping appropriately: + const float2 square_uv_unit = normalize(float2(intersection_pos_local.x, + -intersection_pos_local.y)); + const float2 square_uv = arc_len * square_uv_unit; + const float2 video_uv = square_uv / geom_aspect; + return video_uv; +} + +float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a sphere. This is the + // inverse of sphere_xyz_to_uv(). + // Expand video_uv by the aspect ratio to get proportionate x/y lengths, + // then calculate an xyz position for the spherical mapping above. + const float2 square_uv = video_uv * geom_aspect; + // Using length or sqrt here butchers the framerate on my 8800GTS if + // this function is called too many times, and so does taking the max + // component of square_uv/square_uv_unit (program length threshold?). 
+ //float arc_len = length(square_uv); + // square_uv.y/square_uv_unit.y below is algebraically length(square_uv). + // NOTE(review): that quotient is 0/0 when square_uv.y == 0.0, and + // normalize() of a zero-length square_uv is undefined - confirm callers + // tolerate or avoid those inputs. + const float2 square_uv_unit = normalize(square_uv); + const float arc_len = square_uv.y/square_uv_unit.y; + const float angle_from_image_center = arc_len / geom_radius; + const float xy_dist_from_sphere_center = + sin(angle_from_image_center) * geom_radius; + //float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len)); + const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit; + const float z_pos = cos(angle_from_image_center) * geom_radius; + const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos); + return intersection_pos_local; +} + +float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) +{ + // Requires: An xyz intersection position on a sphere. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: Define square_uv.x to be the signed arc length in xz-space, + // and define square_uv.y == signed arc length in yz-space. + // See cylinder_xyz_to_uv() for implementation details (very similar). + const float2 angle_from_image_center = atan2( + float2(intersection_pos_local.x, -intersection_pos_local.y), + intersection_pos_local.zz); + const float2 signed_arc_len = angle_from_image_center * geom_radius; + const float2 video_uv = signed_arc_len / geom_aspect; + return video_uv; +} + +float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a sphere. This is the + // inverse of sphere_alt_xyz_to_uv(). + // See cylinder_uv_to_xyz() for implementation details (very similar). 
+ const float2 square_uv = video_uv * geom_aspect; + const float2 arc_len = square_uv; + const float2 angle_from_image_center = arc_len / geom_radius; + const float2 xy_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos)); + return float3(xy_pos.x, -xy_pos.y, z_pos); +} + +inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local, + const float geom_mode) +{ + return geom_mode < 2.5 ? intersect_sphere(view_vec_local, eye_pos_local) : + intersect_cylinder(view_vec_local, eye_pos_local); +} + +inline float2 xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect, const float geom_mode) +{ + return geom_mode < 1.5 ? + sphere_xyz_to_uv(intersection_pos_local, geom_aspect) : + geom_mode < 2.5 ? + sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) : + cylinder_xyz_to_uv(intersection_pos_local, geom_aspect); +} + +inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect, + const float geom_mode) +{ + return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) : + geom_mode < 2.5 ? sphere_alt_uv_to_xyz(uv, geom_aspect) : + cylinder_uv_to_xyz(uv, geom_aspect); +} + +float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local, + const float2 geom_aspect, const float geom_mode, out float3 intersection_pos) +{ + // Get the intersection point on the primitive, given an eye position + // and view vector already in its local coordinate frame: + const float2 intersect_dist_and_discriminant = intersect(view_vec_local, + eye_pos_local, geom_mode); + const float3 intersection_pos_local = eye_pos_local + + view_vec_local * intersect_dist_and_discriminant.x; + // Save the intersection position to an output parameter: + intersection_pos = intersection_pos_local; + // Transform into uv coords, but give out-of-range coords if the + // view ray doesn't intersect the primitive in the first place: + return intersect_dist_and_discriminant.y > 0.005 ? 
xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : float2(1.0); +} + +float3 get_ideal_global_eye_pos_for_points(float3 eye_pos, + const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE], + const int num_points) +{ + // Requires: Parameters: + // 1.) Starting eye_pos is a global 3D position at which the + // camera contains all points in global_coords[] in its FOV + // 2.) geom_aspect = get_aspect_vector( + // output_size.x / output_size.y); + // 3.) global_coords is a point cloud containing global xyz + // coords of extreme points on the simulated CRT screen. + // 4.) num_points is the number of leading entries of + // global_coords[] to use; it must not exceed + // MAX_POINT_CLOUD_SIZE. + // Globals: + // 1.) geom_view_dist must be > 0.0. It controls the "near + // plane" used to interpret flat_video_uv as a view + // vector, which controls the field of view (FOV). + // Eyespace coordinate frame: +x = right, +y = up, +z = back + // Returns: Return an eye position at which the point cloud spans as + // much of the screen as possible (given the FOV controlled by + // geom_view_dist) without being cropped or sheared. + // Algorithm: + // 1.) Move the eye laterally to a point which attempts to maximize the + // amount we can move forward without clipping the CRT screen. + // 2.) Move forward by as much as possible without clipping the CRT. 
+ // Get the allowed movement range by solving for the eye_pos offsets + // that result in each point being projected to a screen edge/corner in + // pseudo-normalized device coords (where xy ranges from [-0.5, 0.5] + // and z = eyespace z): + // pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)* + // geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z); + // Notes: + // The field of view is controlled by geom_view_dist's magnitude relative to + // the view vector's x and y components: + // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect + // view_vec.z = -geom_view_dist + // But for the purposes of perspective divide, it should be considered: + // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist + // view_vec.z = -1.0 + static const int max_centering_iters = 1; // Keep for easy testing. + for(int iter = 0; iter < max_centering_iters; iter++) + { + // 0.) Get the eyespace coordinates of our point cloud: + float3 eyespace_coords[MAX_POINT_CLOUD_SIZE]; + for(int i = 0; i < num_points; i++) + { + eyespace_coords[i] = global_coords[i] - eye_pos; + } + // 1a.)For each point, find out how far we can move eye_pos in each + // lateral direction without the point clipping the frustum. + // Eyespace +y = up, screenspace +y = down, so flip y after + // applying the eyespace offset (on the way to "clip space"). + // Solve for two offsets per point based on: + // (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5) + // (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5) + // offset_ul and offset_dr represent the farthest we can move the + // eye_pos up-left and down-right. Save the min of all offset_dr's + // and the max of all offset_ul's (since it's negative). + float abs_radius = abs(geom_radius); // In case anyone gets ideas. 
;) + float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius); + float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius); + for(int i = 0; i < num_points; i++) + { + static const float2 flipy = float2(1.0, -1.0); + float3 eyespace_xyz = eyespace_coords[i]; + float2 offset_dr = eyespace_xyz.xy - float2(-0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + float2 offset_ul = eyespace_xyz.xy - float2(0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + offset_dr_min = min(offset_dr_min, offset_dr); + offset_ul_max = max(offset_ul_max, offset_ul); + } + // 1b.)Update eye_pos: Adding the average of offset_ul_max and + // offset_dr_min gives it equal leeway on the top vs. bottom + // and left vs. right. Recalculate eyespace_coords accordingly. + float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min); + eye_pos.xy += center_offset; + for(int i = 0; i < num_points; i++) + { + eyespace_coords[i] = global_coords[i] - eye_pos; + } + // 2a.)For each point, find out how far we can move eye_pos forward + // without the point clipping the frustum. Flip the y + // direction in advance (matters for a later step, not here). + // Solve for four offsets per point based on: + // eyespace_xyz_flipy.x * geom_view_dist / + // (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5 + // eyespace_xyz_flipy.y * geom_view_dist / + // (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5 + // eyespace_xyz_flipy.x * geom_view_dist / + // (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5 + // eyespace_xyz_flipy.y * geom_view_dist / + // (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5 + // We'll vectorize the actual computation. Take the maximum of + // these four for a single offset, and continue taking the max + // for every point (use max because offset.z is negative). 
+ float offset_z_max = -10.0 * geom_radius * geom_view_dist; + for(int i = 0; i < num_points; i++) + { + float3 eyespace_xyz_flipy = eyespace_coords[i] * + float3(1.0, -1.0, 1.0); + float4 offset_zzzz = eyespace_xyz_flipy.zzzz + + (eyespace_xyz_flipy.xyxy * geom_view_dist) / + (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect)); + // Ignore offsets that push positive x/y values to opposite + // boundaries, and vice versa, and don't let the camera move + // past a point in the dead center of the screen: + offset_z_max = (eyespace_xyz_flipy.x < 0.0) ? + max(offset_z_max, offset_zzzz.x) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.y < 0.0) ? + max(offset_z_max, offset_zzzz.y) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.x > 0.0) ? + max(offset_z_max, offset_zzzz.z) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.y > 0.0) ? + max(offset_z_max, offset_zzzz.w) : offset_z_max; + offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z); + } + // 2b.)Update eye_pos: Add the maximum (smallest negative) z offset. + eye_pos.z += offset_z_max; + } + return eye_pos; +} + +float3 get_ideal_global_eye_pos(const float3x3 local_to_global, + const float2 geom_aspect, const float geom_mode) +{ + // Start with an initial eye_pos that includes the entire primitive + // (sphere or cylinder) in its field-of-view: + const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist); + const float3 low_view = high_view * float3(1.0, -1.0, 1.0); + const float len_sq = dot(high_view, high_view); + const float fov = abs(acos(dot(high_view, low_view)/len_sq)); + // Trigonometry/similar triangles say distance = geom_radius/sin(fov/2): + const float eye_z_spherical = geom_radius/sin(fov*0.5); + const float3 eye_pos = geom_mode < 2.5 ? + float3(0.0, 0.0, eye_z_spherical) : + float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical)); + + // Get global xyz coords of extreme sample points on the simulated CRT + // screen. 
Start with the center, edge centers, and corners of the + // video image. We can't ignore backfacing points: They're occluded + // by closer points on the primitive, but they may NOT be occluded by + // the convex hull of the remaining samples (i.e. the remaining convex + // hull might not envelop points that do occlude a back-facing point.) + static const int num_points = MAX_POINT_CLOUD_SIZE; + float3 global_coords[MAX_POINT_CLOUD_SIZE]; + global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode)); + global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode)); + global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode)); + global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode)); + global_coords[4] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode)); + global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode)); + global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode)); + global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode)); + global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode)); + // Adding more inner image points could help in extreme cases, but too many + // points will kill the framerate. For safety, default to the initial + // eye_pos if any z coords are negative (check every point, not just [0]): + float num_negative_z_coords = 0.0; + for(int i = 0; i < num_points; i++) + { + num_negative_z_coords += float(global_coords[i].z < 0.0); + } + // Outsource the optimized eye_pos calculation: + return num_negative_z_coords > 0.5 ?
eye_pos : + get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect, + global_coords, num_points); +} + +float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local, + const float3 eye_pos_local, const float3 view_vec_global, + const float3 intersection_pos_local, const float3 normal, + const float2 output_size_inv) +{ + // Requires: See get_curved_video_uv_coords_and_tangent_matrix for + // descriptions of each parameter. + // Returns: Return a transformation matrix from 2D pixel-space vectors + // (where (+1.0, +1.0) is a vector to one pixel down-right, + // i.e. same directionality as uv texels) to 3D object-space + // vectors in the CRT's local coordinate frame (right-handed) + // ***which are tangent to the CRT surface at the intersection + // position.*** (Basically, we want to convert pixel-space + // vectors to 3D vectors along the CRT's surface, for later + // conversion to uv vectors.) + // Shorthand inputs: + const float3 pos = intersection_pos_local; + const float3 eye_pos = eye_pos_local; + // Get a piecewise-linear matrix transforming from "pixelspace" offset + // vectors (1.0 = one pixel) to object space vectors in the tangent + // plane (faster than finding 3 view-object intersections). + // 1.) Get the local view vecs for the pixels to the right and down: + const float3 view_vec_right_global = view_vec_global + + float3(output_size_inv.x, 0.0, 0.0); + const float3 view_vec_down_global = view_vec_global + + float3(0.0, -output_size_inv.y, 0.0); + const float3 view_vec_right_local = + mul(global_to_local, view_vec_right_global); + const float3 view_vec_down_local = + mul(global_to_local, view_vec_down_global); + // 2.) 
Using the true intersection point, intersect the neighboring + // view vectors with the tangent plane: + const float3 intersection_vec_dot_normal = float3(dot(pos - eye_pos, normal), dot(pos - eye_pos, normal), dot(pos - eye_pos, normal)); + const float3 right_pos = eye_pos + (intersection_vec_dot_normal / + dot(view_vec_right_local, normal))*view_vec_right_local; + const float3 down_pos = eye_pos + (intersection_vec_dot_normal / + dot(view_vec_down_local, normal))*view_vec_down_local; + // 3.) Subtract the original intersection pos from its neighbors; the + // resulting vectors are object-space vectors tangent to the plane. + // These vectors are the object-space transformations of (1.0, 0.0) + // and (0.0, 1.0) pixel offsets, so they form the first two basis + // vectors of a pixelspace to object space transformation. This + // transformation is 2D to 3D, so use (0, 0, 0) for the third vector. + const float3 object_right_vec = right_pos - pos; + const float3 object_down_vec = down_pos - pos; + const float3x3 pixel_to_object = float3x3( + object_right_vec.x, object_down_vec.x, 0.0, + object_right_vec.y, object_down_vec.y, 0.0, + object_right_vec.z, object_down_vec.z, 0.0); + return pixel_to_object; +} + +float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local, + const float3 normal, const float2 geom_aspect, const float geom_mode) +{ + // Requires: See get_curved_video_uv_coords_and_tangent_matrix for + // descriptions of each parameter. + // Returns: Return a transformation matrix from 3D object-space vectors + // in the CRT's local coordinate frame (right-handed, +y = up) + // to 2D video_uv vectors (+v = down). + // Description: + // The TBN matrix formed by the [tangent, bitangent, normal] basis + // vectors transforms ordinary vectors from tangent->object space. + // The cotangent matrix formed by the [cotangent, cobitangent, normal] + // basis vectors transforms normal vectors (covectors) from + // tangent->object space. 
It's the inverse-transpose of the TBN matrix. + // We want the inverse of the TBN matrix (transpose of the cotangent + // matrix), which transforms ordinary vectors from object->tangent space. + // Start by calculating the relevant basis vectors in accordance with + // Christian Schüler's blog post "Followup: Normal Mapping Without + // Precomputed Tangents": http://www.thetenthplanet.de/archives/1180 + // With our particular uv mapping, the scale of the u and v directions + // is determined entirely by the aspect ratio for cylindrical and ordinary + // spherical mappings, and so tangent and bitangent lengths are also + // determined by it (the alternate mapping is more complex). Therefore, we + // must ensure appropriate cotangent and cobitangent lengths as well. + // Base these off the uv<=>xyz mappings for each primitive. + const float3 pos = intersection_pos_local; + static const float3 x_vec = float3(1.0, 0.0, 0.0); + static const float3 y_vec = float3(0.0, 1.0, 0.0); + // The tangent and bitangent vectors correspond with increasing u and v, + // respectively. Mathematically we'd base the cotangent/cobitangent on + // those, but we'll compute the cotangent/cobitangent directly when we can. + float3 cotangent_unscaled, cobitangent_unscaled; + // geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE. 
+ if(geom_mode < 1.5) + { + // Sphere: + // tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x + // bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y + // inv_determinant = 1.0/length(cross(bitangent, tangent)) + // cotangent = cross(normal, bitangent) * inv_determinant + // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant + // cobitangent = cross(tangent, normal) * inv_determinant + // == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant + // Simplified (scale by inv_determinant below): + cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y; + cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x; + } + else if(geom_mode < 2.5) + { + // Sphere, alternate mapping: + // This mapping works a bit like the cylindrical mapping in two + // directions, which makes the lengths and directions more complex. + // Unfortunately, I can't find much of a shortcut: + const float3 tangent = normalize( + cross(y_vec, float3(pos.x, 0.0, pos.z))) * geom_aspect.x; + const float3 bitangent = normalize( + cross(x_vec, float3(0.0, pos.yz))) * geom_aspect.y; + cotangent_unscaled = cross(normal, bitangent); + cobitangent_unscaled = cross(tangent, normal); + } + else + { + // Cylinder: + // tangent = normalize(cross(y_vec, normal)) * geom_aspect.x; + // bitangent = float3(0.0, -geom_aspect.y, 0.0); + // inv_determinant = 1.0/length(cross(bitangent, tangent)) + // cotangent = cross(normal, bitangent) * inv_determinant + // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant + // cobitangent = cross(tangent, normal) * inv_determinant + // == float3(0.0, -geom_aspect.x, 0.0) * inv_determinant + cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y; + cobitangent_unscaled = float3(0.0, -geom_aspect.x, 0.0); + } + const float3 computed_normal = + cross(cobitangent_unscaled, cotangent_unscaled); + const float inv_determinant = rsqrt(dot(computed_normal, computed_normal)); + const 
float3 cotangent = cotangent_unscaled * inv_determinant; + const float3 cobitangent = cobitangent_unscaled * inv_determinant; + // The [cotangent, cobitangent, normal] column vecs form the cotangent + // frame, i.e. the inverse-transpose TBN matrix. Get its transpose: + const float3x3 object_to_tangent = float3x3(cotangent, cobitangent, normal); + return object_to_tangent; +} + +float2 get_curved_video_uv_coords_and_tangent_matrix( + const float2 flat_video_uv, const float3 eye_pos_local, + const float2 output_size_inv, const float2 geom_aspect, + const float geom_mode, const float3x3 global_to_local, + out float2x2 pixel_to_tangent_video_uv) +{ + // Requires: Parameters: + // 1.) flat_video_uv coords are in range [0.0, 1.0], where + // (0.0, 0.0) is the top-left corner of the screen and + // (1.0, 1.0) is the bottom-right corner. + // 2.) eye_pos_local is the 3D camera position in the simulated + // CRT's local coordinate frame. For best results, it must + // be computed based on the same geom_view_dist used here. + // 3.) output_size_inv = float2(1.0)/output_size + // 4.) geom_aspect = get_aspect_vector( + // output_size.x / output_size.y); + // 5.) geom_mode is a static or runtime mode setting: + // 0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder + // 6.) global_to_local is a 3x3 matrix transforming (ordinary) + // worldspace vectors to the CRT's local coordinate frame + // Globals: + // 1.) geom_view_dist must be > 0.0. It controls the "near + // plane" used to interpret flat_video_uv as a view + // vector, which controls the field of view (FOV). + // Returns: Return final uv coords in [0.0, 1.0], and return a pixel- + // space to video_uv tangent-space matrix in the out parameter. + // (This matrix assumes pixel-space +y = down, like +v = down.) 
+ // We'll transform flat_video_uv into a view vector, project + // the view vector from the camera/eye, intersect with a sphere + // or cylinder representing the simulated CRT, and convert the + // intersection position into final uv coords and a local + // transformation matrix. + // First get the 3D view vector (geom_aspect and geom_view_dist are globals): + // 1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5) + // correspond to the top-left/bottom-right output screen corners. + // 2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen- + // space 2D aspect correction. We'll reapply it in uv-space. + // 3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y + // is up in 3D worldspace (enforce a right-handed system). + // 4.) The view vector z controls the "near plane" distance and FOV. + // For the effect of "looking through a window" at a CRT, it should be + // set equal to the user's distance from their physical screen, in + // units of the viewport's physical diagonal size. + const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect; + const float3 view_vec_global = + float3(view_uv.x, -view_uv.y, -geom_view_dist); + // Transform the view vector into the CRT's local coordinate frame, convert + // to video_uv coords, and get the local 3D intersection position: + const float3 view_vec_local = mul(global_to_local, view_vec_global); + float3 pos; + const float2 centered_uv = view_vec_to_uv( + view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos); + const float2 video_uv = centered_uv + float2(0.5); + // Get a pixel-to-tangent-video-uv matrix. The caller could deal with + // all but one of these cases, but that would be more complicated. + #ifdef DRIVERS_ALLOW_DERIVATIVES + // Derivatives obtain a matrix very fast, but the direction of pixel- + // space +y seems to depend on the pass. Enforce the correct direction + // on a best-effort basis (but it shouldn't matter for antialiasing). 
+ const float2 duv_dx = ddx(video_uv); + const float2 duv_dy = ddy(video_uv); + #ifdef LAST_PASS + pixel_to_tangent_video_uv = float2x2( + duv_dx.x, duv_dy.x, + -duv_dx.y, -duv_dy.y); + #else + pixel_to_tangent_video_uv = float2x2( + duv_dx.x, duv_dy.x, + duv_dx.y, duv_dy.y); + #endif + #else + // Manually define a transformation matrix. We'll assume pixel-space + // +y = down, just like +v = down. + if(geom_force_correct_tangent_matrix) + { + // Get the surface normal based on the local intersection position: + const float3 normal_base = geom_mode < 2.5 ? pos : + float3(pos.x, 0.0, pos.z); + const float3 normal = normalize(normal_base); + // Get pixel-to-object and object-to-tangent matrices and combine + // them into a 2x2 pixel-to-tangent matrix for video_uv offsets: + const float3x3 pixel_to_object = get_pixel_to_object_matrix( + global_to_local, eye_pos_local, view_vec_global, pos, normal, + output_size_inv); + const float3x3 object_to_tangent = get_object_to_tangent_matrix( + pos, normal, geom_aspect, geom_mode); + const float3x3 pixel_to_tangent3x3 = + mul(object_to_tangent, pixel_to_object); + pixel_to_tangent_video_uv = float2x2( + pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);//._m00_m01_m10_m11); //TODO/FIXME: needs to correct for column-major?? + } + else + { + // Ignore curvature, and just consider flat scaling. The + // difference is only apparent with strong curvature: + pixel_to_tangent_video_uv = float2x2( + output_size_inv.x, 0.0, 0.0, output_size_inv.y); + } + #endif + return video_uv; +} + +float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect) +{ + // COPYRIGHT NOTE FOR THIS FUNCTION: + // Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey + // This function uses an algorithm first coded in several of cgwg's GPL- + // licensed lines in crt-geom-curved.cg and its ancestors. 
The line + // between algorithm and code is nearly indistinguishable here, so it's + // unclear whether I could even release this project under a non-GPL + // license with this function included. + + // Calculate border_dim_factor from the proximity to uv-space image + // borders; border_size/border_darkness/border_compress are globals: + const float2 edge_dists = min(video_uv, float2(1.0) - video_uv) * + geom_aspect; + const float2 border_penetration = + max(float2(border_size) - edge_dists, float2(0.0)); + const float penetration_ratio = length(border_penetration)/border_size; + const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0); + const float border_dim_factor = + pow(border_escape_ratio, border_darkness) * max(1.0, border_compress); + return min(border_dim_factor, 1.0); +} + + + +#endif // GEOMETRY_FUNCTIONS_H + +///////////////////////// END GEOMETRY-FUNCTIONS ///////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +float2x2 mul_scale(float2 scale, float2x2 matrix) +{ + //float2x2 scale_matrix = float2x2(scale.x, 0.0, 0.0, scale.y); + //return mul(scale_matrix, matrix); + float4 intermed = float4(matrix[0][0],matrix[0][1],matrix[1][0],matrix[1][1]) * scale.xxyy; + return float2x2(intermed.x, intermed.y, intermed.z, intermed.w); +} + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + // Localize some parameters: + const float2 geom_aspect = geom_aspect_and_overscan.xy; + const float2 geom_overscan = geom_aspect_and_overscan.zw; + const float2 video_size_inv = video_and_texture_size_inv.xy; + const float2 texture_size_inv = video_and_texture_size_inv.zw; + //const float2 output_size_inv = output_size_inv; + #ifdef RUNTIME_GEOMETRY_TILT + const float3x3 global_to_local = float3x3(global_to_local_row0, + global_to_local_row1, global_to_local_row2); + #else + static const float3x3 global_to_local = geom_global_to_local_static; + #endif + #ifdef RUNTIME_GEOMETRY_MODE +
const float geom_mode = geom_mode_runtime; + #else + static const float geom_mode = geom_mode_static; + #endif + + // Get flat and curved texture coords for the current fragment point sample + // and a pixel_to_tangent_video_uv matrix for transforming pixel offsets: + // video_uv = relative position in video frame, mapped to [0.0, 1.0] range + // tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range + const float2 flat_video_uv = tex_uv * (texture_size * video_size_inv); + float2x2 pixel_to_video_uv; + float2 video_uv_no_geom_overscan; + if(geom_mode > 0.5) + { + video_uv_no_geom_overscan = + get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv, + eye_pos_local, output_size_inv, geom_aspect, + geom_mode, global_to_local, pixel_to_video_uv); + } + else + { + video_uv_no_geom_overscan = flat_video_uv; + pixel_to_video_uv = float2x2( + output_size_inv.x, 0.0, 0.0, output_size_inv.y); + } + // Correct for overscan here (not in curvature code): + const float2 video_uv = + (video_uv_no_geom_overscan - float2(0.5, 0.5))/geom_overscan + float2(0.5, 0.5); + const float2 tex_uv = video_uv * (video_size * texture_size_inv); + + // Get a matrix transforming pixel vectors to tex_uv vectors: + const float2x2 pixel_to_tex_uv = + mul_scale(video_size * texture_size_inv / + geom_aspect_and_overscan.zw, pixel_to_video_uv); + + // Sample! Skip antialiasing if aa_level < 0.5 or both of these hold: + // 1.) Geometry/curvature isn't used + // 2.) Overscan == float2(1.0, 1.0) + // Skipping AA is sharper, but it's only faster with dynamic branches. 
+ const float2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset()); + const bool need_subpixel_aa = abs_aa_r_offset.x + abs_aa_r_offset.y > 0.0; + float3 color; + if(aa_level > 0.5 && (geom_mode > 0.5 || any(bool2((geom_overscan.x != 1.0), (geom_overscan.y != 1.0))))) + { + // Sample the input with antialiasing (due to sharp phosphors, etc.): + color = tex2Daa(input_texture, tex_uv, pixel_to_tex_uv, float(frame_count)); + } + + else if(aa_level > 0.5 && need_subpixel_aa) + { + // Sample at each subpixel location: + color = tex2Daa_subpixel_weights_only( + input_texture, tex_uv, pixel_to_tex_uv); + } + else + { + color = tex2D_linearize(input_texture, tex_uv).rgb; + } + + // Dim borders and output the final result: + const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect); + const float3 final_color = color * border_dim_factor; + + FragColor = encode_output(float4(final_color, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/geometry-aa-last-pass.vs b/shaders/CRT-Royale.shader/geometry-aa-last-pass.vs new file mode 100644 index 00000000..1c99650d --- /dev/null +++ b/shaders/CRT-Royale.shader/geometry-aa-last-pass.vs @@ -0,0 +1,5263 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 tex_uv; + vec4 video_and_texture_size_inv; + vec2 output_size_inv; + vec3 eye_pos_local; + vec4 geom_aspect_and_overscan; + vec3 global_to_local_row0; + vec3 global_to_local_row1; + vec3 global_to_local_row2; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define 
geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(x,y) +#define rsqrt(c) inversesqrt(c) + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#define LAST_PASS +#define SIMULATE_CRT_ON_LCD + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." 
Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-params.h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40.
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an earlier remark here called 0.5 unnecessarily dark and mentioned 1.0, but the value actually set is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note said 1.0 was set here because 0.5 looked unnecessarily dark, but the value in effect is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors settings which still need "cooking:" +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +#ifndef RUNTIME_GEOMETRY_TILT + // Create a local-to-global rotation matrix for the CRT's coordinate frame + // and its global-to-local inverse. See the vertex shader for details. + // It's faster to compute these statically if possible. 
+ static const float2 sin_tilt = sin(geom_tilt_angle_static); + static const float2 cos_tilt = cos(geom_tilt_angle_static); + static const float3x3 geom_local_to_global_static = float3x3( + cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, + 0.0, cos_tilt.y, -sin_tilt.y, + -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); + static const float3x3 geom_global_to_local_static = float3x3( + cos_tilt.x, 0.0, -sin_tilt.x, + sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x, + cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x); +#endif + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: NOTE(review): the texel_off overloads below pass texel_off to textureLod() as its LOD argument rather than applying a texel offset - confirm this is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* NOTE(review): this marker does not close the block comment opened above the tex3D wrappers, so everything down to the next block-comment terminator (including the tex2D_linearize_gamma overloads below) stays disabled - confirm that is intended. + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//#include "tex2Dantialias.h" + +///////////////////////// BEGIN TEX2DANTIALIAS ///////////////////////// + +#ifndef TEX2DANTIALIAS_H +#define TEX2DANTIALIAS_H + +///////////////////////////// GPL LICENSE 
NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides antialiased and subpixel-aware tex2D lookups. +// Requires: All functions share these requirements: +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixel- +// space offsets to texture uv offsets. You can get this with: +// const float2 duv_dx = ddx(tex_uv); +// const float2 duv_dy = ddy(tex_uv); +// const float2x2 pixel_to_tex_uv = float2x2( +// duv_dx.x, duv_dy.x, +// duv_dx.y, duv_dy.y); +// This is left to the user in case the current Cg profile +// doesn't support ddx()/ddy(). Ideally, the user could +// calculate a distorted tangent-space mapping analytically. +// If not, a simple flat mapping can be obtained with: +// const float2 xy_to_uv_scale = output_size * +// video_size/texture_size; +// const float2x2 pixel_to_tex_uv = float2x2( +// xy_to_uv_scale.x, 0.0, +// 0.0, xy_to_uv_scale.y); +// Optional: To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and: +// 1.)
Set an antialiasing level: +// static const float aa_level = {0 (none), +// 1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24} +// 2.) Set a filter type: +// static const float aa_filter = { +// 0 (Box, Separable), 1 (Box, Cylindrical), +// 2 (Tent, Separable), 3 (Tent, Cylindrical), +// 4 (Gaussian, Separable), 5 (Gaussian, Cylindrical), +// 6 (Cubic, Separable), 7 (Cubic, Cylindrical), +// 8 (Lanczos Sinc, Separable), +// 9 (Lanczos Jinc, Cylindrical)} +// If the input is unknown, a separable box filter is used. +// Note: Lanczos Jinc is terrible for sparse sampling, and +// using aa_xy_axis_importance (see below) defeats the purpose. +// 3.) Mirror the sample pattern on odd frames? +// static const bool aa_temporal = {true, false} +// This helps rotational invariance but can look "fluttery." +// The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override +// (all of) the following default parameters with static or uniform +// constants (or an accessor function for subpixel offsets): +// 1.) Cubic parameters: +// static const float aa_cubic_c = 0.5; +// See http://www.imagemagick.org/Usage/filter/#mitchell +// 2.) Gaussian parameters: +// static const float aa_gauss_sigma = +// 0.5/aa_pixel_diameter; +// 3.) Set subpixel offsets. This requires an accessor function +// for compatibility with scalar runtime shader parameters. Return +// a float2 pixel offset in [-0.5, 0.5] for the red subpixel: +// float2 get_aa_subpixel_r_offset() +// The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to +// override (all of) the following default static values. However, +// the file's structure requires them to be declared static const: +// 1.) static const float aa_lanczos_lobes = 3.0; +// 2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter; +// Note the default tent/Gaussian support radii may appear +// arbitrary, but extensive testing found them nearly optimal +// for tough cases like strong distortion at low AA levels.
+// (The Gaussian default is only best for practical gauss_sigma +// values; much larger gauss_sigmas ironically prefer slightly +// smaller support given sparse sampling, and vice versa.) +// 3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter; +// 4.) static const float2 aa_xy_axis_importance: +// The sparse N-queens sampling grid interacts poorly with +// negative-lobed 2D filters. However, if aliasing is much +// stronger in one direction (e.g. horizontally with a phosphor +// mask), it can be useful to downplay sample offsets along the +// other axis. The support radius in each direction scales with +// aa_xy_axis_importance down to a minimum of 0.5 (box support), +// after which point only the offsets used for calculating +// weights continue to scale downward. This works as follows: +// If aa_xy_axis_importance = float2(1.0, 1.0/support_radius), +// the vertical support radius will drop to 1.0, and we'll just +// filter vertical offsets with the first filter lobe, while +// horizontal offsets go through the full multi-lobe filter. +// If aa_xy_axis_importance = float2(1.0, 0.0), the vertical +// support radius will drop to box support, and the vertical +// offsets will be ignored entirely (essentially giving us a +// box filter vertically). The former is potentially smoother +// (but less predictable) and the default behavior of Lanczos +// jinc, whereas the latter is sharper and the default behavior +// of cubics and Lanczos sinc. +// 5.) static const float aa_pixel_diameter: You can expand the +// pixel diameter to e.g. sqrt(2.0), which may be a better +// support range for cylindrical filters (they don't +// currently discard out-of-circle samples though). +// Finally, there are two miscellaneous options: +// 1.) If you want to antialias a manually tiled texture, you can +// #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to +// fix incompatibilities with anisotropic filtering. 
This is +// slower, and the Cg profile must support tex2Dlod(). +// 2.) If aa_cubic_c is a runtime uniform, you can #define +// RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per +// fragment instead of at the usage site (which is used by +// default, because it enables static evaluation). +// Description: +// Each antialiased lookup follows these steps: +// 1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5] +// pixels, spanning the diameter of a rectangular box filter. +// 2.) Scale these offsets by the support diameter of the user's chosen filter. +// 3.) Using these pixel offsets from the pixel center, compute the offsets to +// predefined subpixel locations. +// 4.) Compute filter weights based on subpixel offsets. +// Much of that can often be done at compile-time. At runtime: +// 1.) Project pixel-space offsets into uv-space with a matrix multiplication +// to get the uv offsets for each sample. Rectangular pixels have a +// diameter of 1.0. Circular pixels are not currently supported, but they +// might be better with a diameter of sqrt(2.0) to ensure there are no gaps +// between them. +// 2.) Load, weight, and sum samples. +// We use a sparse bilinear sampling grid, so there are two major implications: +// 1.) We can directly project the pixel-space support box into uv-space even +// if we're upsizing. This wouldn't be the case for nearest neighbor, +// where we'd have to expand the uv-space diameter to at least the support +// size to ensure sufficient filter support. In our case, this allows us +// to treat upsizing the same as downsizing and use static weighting. :) +// 2.) For decent results, negative-lobed filters must be computed based on +// separable weights, not radial distances, because the sparse sampling +// makes no guarantees about radial distributions. Even then, it's much +// better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g. +// Lanczos2 horizontally and a box filter vertically. 
This is mainly due +// to the sparse N-queens sampling and a statistically enormous positive or +// negative covariance between horizontal and vertical weights. +// +// Design Decision Comments: +// "aa_temporal" mirrors the sample pattern on odd frames along the axis that +// keeps subpixel weights constant. This helps with rotational invariance, but +// it can cause distracting fluctuations, and horizontal and vertical edges +// will look the same. Using a different pattern on a shifted grid would +// exploit temporal AA better, but it would require a dynamic branch or a lot +// of conditional moves, so it's prohibitively slow for the minor benefit. + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#ifndef ANTIALIAS_OVERRIDE_BASICS + // The following settings must be static constants: + static const float aa_level = 12.0; + static const float aa_filter = 0.0; + static const bool aa_temporal = false; +#endif + +#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS + // Users may override these parameters, but the file structure requires + // them to be static constants; see the descriptions above. + static const float aa_pixel_diameter = 1.0; + static const float aa_lanczos_lobes = 3.0; + static const float aa_gauss_support = 1.0 / aa_pixel_diameter; + static const float aa_tent_support = 1.0 / aa_pixel_diameter; + + // If we're using a negative-lobed filter, default to using it horizontally + // only, and use only the first lobe vertically or a box filter, over a + // correspondingly smaller range. This compensates for the sparse sampling + // grid's typically large positive/negative x/y covariance. + static const float2 aa_xy_axis_importance = + aa_filter < 5.5 ? float2(1.0) : // Box, tent, Gaussian + aa_filter < 8.5 ? float2(1.0, 0.0) : // Cubic and Lanczos sinc + aa_filter < 9.5 ? 
float2(1.0, 1.0/aa_lanczos_lobes) : // Lanczos jinc + float2(1.0); // Default to box +#endif + +#ifndef ANTIALIAS_OVERRIDE_PARAMETERS + // Users may override these values with their own uniform or static consts. + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c = 0.5; + static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter; + // Users may override the subpixel offset accessor function with their own. + // A function is used for compatibility with scalar runtime shader parameters. + inline float2 get_aa_subpixel_r_offset() + { + return float2(0.0, 0.0); + } +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../../../../include/gamma-management.h" + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +static const float aa_box_support = 0.5; +static const float aa_cubic_support = 2.0; + + +//////////////////////////// GLOBAL NON-CONSTANTS //////////////////////////// + +// We'll want to define these only once per fragment at most. +#ifdef RUNTIME_ANTIALIAS_WEIGHTS + float aa_cubic_b; + float cubic_branch1_x3_coeff; + float cubic_branch1_x2_coeff; + float cubic_branch1_x0_coeff; + float cubic_branch2_x3_coeff; + float cubic_branch2_x2_coeff; + float cubic_branch2_x1_coeff; + float cubic_branch2_x0_coeff; +#endif + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +void assign_aa_cubic_constants() +{ + // Compute cubic coefficients on demand at runtime, and save them to the + // globals declared above. The B parameter is computed from C, because "Keys cubics" + // with B = 1 - 2C are considered the highest quality.
+ #ifdef RUNTIME_ANTIALIAS_WEIGHTS + if(aa_filter > 5.5 && aa_filter < 7.5) + { + aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; + cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; + cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; + } + #endif +} + +inline float4 get_subpixel_support_diam_and_final_axis_importance() +{ + // Statically select the base support radius: + static const float base_support_radius = + aa_filter < 1.5 ? aa_box_support : + aa_filter < 3.5 ? aa_tent_support : + aa_filter < 5.5 ? aa_gauss_support : + aa_filter < 7.5 ? aa_cubic_support : + aa_filter < 9.5 ? aa_lanczos_lobes : + aa_box_support; // Default to box + // Expand the filter support for subpixel filtering. + const float2 subpixel_support_radius_raw = + float2(base_support_radius) + abs(get_aa_subpixel_r_offset()); + if(aa_filter < 1.5) + { + // Ignore aa_xy_axis_importance for box filtering. + const float2 subpixel_support_diam = + 2.0 * subpixel_support_radius_raw; + const float2 final_axis_importance = float2(1.0); + return float4(subpixel_support_diam, final_axis_importance); + } + else + { + // Scale the support window by aa_xy_axis_importance, but don't narrow + // it further than box support. This allows decent vertical AA without + // messing up horizontal weights or using something silly like Lanczos4 + // horizontally with a huge vertical average over an 8-pixel radius. 
+ const float2 subpixel_support_radius = max(float2(aa_box_support, aa_box_support), + subpixel_support_radius_raw * aa_xy_axis_importance); + // Adjust aa_xy_axis_importance to compensate for what's already done: + const float2 final_axis_importance = aa_xy_axis_importance * + subpixel_support_radius_raw/subpixel_support_radius; + const float2 subpixel_support_diam = 2.0 * subpixel_support_radius; + return float4(subpixel_support_diam, final_axis_importance); + } +} + + +/////////////////////////// FILTER WEIGHT FUNCTIONS ////////////////////////// + +inline float eval_box_filter(const float dist) +{ + return float(abs(dist) <= aa_box_support); +} + +inline float eval_separable_box_filter(const float2 offset) +{ + return float(all(bool2((abs(offset.x) <= aa_box_support), (abs(offset.y) <= aa_box_support)))); +} +// Tent (triangle) weight: 1 at dist == 0, falling linearly to 0 at |dist| >= aa_tent_support. abs() keeps it symmetric for the signed per-axis offsets the separable tent path passes in. +inline float eval_tent_filter(const float dist) +{ + return clamp((aa_tent_support - abs(dist))/ + aa_tent_support, 0.0, 1.0); +} + +inline float eval_gaussian_filter(const float dist) +{ + return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma)); +} + +inline float eval_cubic_filter(const float dist) +{ + // Compute coefficients like assign_aa_cubic_constants(), but statically. + #ifndef RUNTIME_ANTIALIAS_WEIGHTS + // When runtime weights are used, these values are instead written to + // global uniforms at the beginning of each tex2Daa* call.
+ const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; + const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; + const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; + #endif + const float abs_dist = abs(dist); + // Compute the cubic based on the Horner's method formula in: + // http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf + return (abs_dist < 1.0 ? + (cubic_branch1_x3_coeff*abs_dist + + cubic_branch1_x2_coeff)*abs_dist*abs_dist + + cubic_branch1_x0_coeff : + abs_dist < 2.0 ? + ((cubic_branch2_x3_coeff*abs_dist + + cubic_branch2_x2_coeff)*abs_dist + + cubic_branch2_x1_coeff)*abs_dist + cubic_branch2_x0_coeff : + 0.0)/6.0; +} + +inline float eval_separable_cubic_filter(const float2 offset) +{ + // This is faster than using a specific float2 version: + return eval_cubic_filter(offset.x) * + eval_cubic_filter(offset.y); +} + +inline float2 eval_sinc_filter(const float2 offset) +{ + // It's faster to let the caller handle the zero case, or at least it + // was when I used macros and the shader preset took a full minute to load. + const float2 pi_offset = pi * offset; + return sin(pi_offset)/pi_offset; +} + +inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe) +{ + // Note: For sparse sampling, you really need to pick an axis to use + // Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)). 
+ const float2 offset = FIX_ZERO(offset_unsafe); + const float2 xy_weights = eval_sinc_filter(offset) * + eval_sinc_filter(offset/aa_lanczos_lobes); + return xy_weights.x * xy_weights.y; +} + +inline float eval_jinc_filter_unorm(const float x) +{ + // This is a Jinc approximation for x in [0, 45). We'll use x in range + // [0, 4*pi) or so. There are faster/closer approximations based on + // piecewise cubics from [0, 45) and asymptotic approximations beyond that, + // but this has a maximum absolute error < 1/512, and it's simpler/faster + // for shaders...not that it's all that useful for sparse sampling anyway. + const float point3845_x = 0.38448566093564*x; + const float exp_term = exp(-(point3845_x*point3845_x)); + const float point8154_plus_x = 0.815362332840791 + x; + const float cos_term = cos(point8154_plus_x); + return ( + 0.0264727330997042*min(x, 6.83134964622778) + + 0.680823557250528*exp_term + + -0.0597255978950933*min(7.41043194481873, x)*cos_term / + (point8154_plus_x + 0.0646074538634482*(x*x) + + cos(x)*max(exp_term, cos(x) + cos_term)) - + 0.180837503591406); +} + +// Evaluates the Jinc approximation above at pi * dist. +inline float eval_jinc_filter(const float dist) +{ + return eval_jinc_filter_unorm(pi * dist); +} + +// Jinc weight at dist multiplied by a second Jinc evaluated at +// dist/aa_lanczos_lobes, which acts as the Lanczos window. +inline float eval_lanczos_jinc_filter(const float dist) +{ + return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes); +} + + +inline float3 eval_unorm_rgb_weights(const float2 offset, + const float2 final_axis_importance) +{ + // Requires: 1.) final_axis_importance must be computed according to + // get_subpixel_support_diam_and_final_axis_importance(). + // 2.) aa_filter must be a global constant. + // 3.) offset must be an xy pixel offset in the range: + // ([-subpixel_support_diameter.x/2, + // subpixel_support_diameter.x/2], + // [-subpixel_support_diameter.y/2, + // subpixel_support_diameter.y/2]) + // Returns: Sample weights at R/G/B destination subpixels for the + // given xy pixel offset.
+ const float2 offset_g = offset * final_axis_importance; + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 offset_r = offset_g - aa_r_offset * final_axis_importance; + const float2 offset_b = offset_g + aa_r_offset * final_axis_importance; + // Statically select a filter: + if(aa_filter < 0.5) + { + return float3(eval_separable_box_filter(offset_r), + eval_separable_box_filter(offset_g), + eval_separable_box_filter(offset_b)); + } + else if(aa_filter < 1.5) + { + return float3(eval_box_filter(length(offset_r)), + eval_box_filter(length(offset_g)), + eval_box_filter(length(offset_b))); + } + else if(aa_filter < 2.5) + { + return float3( + eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y), + eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y), + eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y)); + } + else if(aa_filter < 3.5) + { + return float3(eval_tent_filter(length(offset_r)), + eval_tent_filter(length(offset_g)), + eval_tent_filter(length(offset_b))); + } + else if(aa_filter < 4.5) + { + return float3( + eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y), + eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y), + eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y)); + } + else if(aa_filter < 5.5) + { + return float3(eval_gaussian_filter(length(offset_r)), + eval_gaussian_filter(length(offset_g)), + eval_gaussian_filter(length(offset_b))); + } + else if(aa_filter < 6.5) + { + return float3( + eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y), + eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y), + eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y)); + } + else if(aa_filter < 7.5) + { + return float3(eval_cubic_filter(length(offset_r)), + eval_cubic_filter(length(offset_g)), + eval_cubic_filter(length(offset_b))); + } + else if(aa_filter < 8.5) + { + return float3(eval_separable_lanczos_sinc_filter(offset_r), + 
eval_separable_lanczos_sinc_filter(offset_g), + eval_separable_lanczos_sinc_filter(offset_b)); + } + else if(aa_filter < 9.5) + { + return float3(eval_lanczos_jinc_filter(length(offset_r)), + eval_lanczos_jinc_filter(length(offset_g)), + eval_lanczos_jinc_filter(length(offset_b))); + } + else + { + // Default to a box, because Lanczos Jinc is so bad. ;) + return float3(eval_separable_box_filter(offset_r), + eval_separable_box_filter(offset_g), + eval_separable_box_filter(offset_b)); + } +} + + +////////////////////////////// HELPER FUNCTIONS ////////////////////////////// + +inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s) +{ + // If we're manually tiling a texture, anisotropic filtering can get + // confused. This is one workaround: + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + // TODO: Use tex2Dlod_linearize with a calculated mip level. + return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0)); + #else + return tex2D_linearize(samp, s); + #endif +} + +// Returns the per-axis sign that the tex2Daa* functions multiply into their +// sample offsets for the given frame index: (1.0, 1.0) unless aa_temporal is +// set and the frame is odd. +inline float2 get_frame_sign(const float frame) +{ + if(aa_temporal) + { + // Mirror the sampling pattern for odd frames in a direction that + // lets us keep the same subpixel sample weights: + const float frame_odd = float(fmod(frame, 2.0) > 0.5); + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + // mirror is -1.0 on each axis where abs(aa_r_offset) is below + // FIX_ZERO(0.0) (no subpixel offset on that axis) and 0.0 elsewhere: + const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0))); + // Even frames keep +1.0 on both axes; odd frames flip only the + // mirrorable axes. Returning mirror itself would ignore frame_odd + // and scale the offsets on every other axis by 0.0. + return float2(1.0, 1.0) + 2.0 * frame_odd * mirror; + } + else + { + return float2(1.0, 1.0); + } +} + + +///////////////////////// ANTIALIASED TEXTURE LOOKUPS //////////////////////// + +float3 tex2Daa_subpixel_weights_only(const sampler2D tex, + const float2 tex_uv, const float2x2 pixel_to_tex_uv) +{ + // This function is unlike the others: Just perform a single independent + // lookup for each subpixel. It may be very aliased.
+ const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset); + const float color_g = tex2D_linearize(tex, tex_uv).g; + const float color_r = tex2D_linearize(tex, tex_uv + aa_r_offset_uv_offset).r; + const float color_b = tex2D_linearize(tex, tex_uv - aa_r_offset_uv_offset).b; + return float3(color_r, color_g, color_b); +} + +// The tex2Daa* functions compile very slowly due to all the macros and +// compile-time math, so only include the ones we'll actually use! +float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use an RGMS4 pattern (4-queens): + // . . Q . : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4 + // Q . . . : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4 + // . . . Q : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4 + // . Q . . : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4 + // Static screenspace sample offsets (compute some implicitly): + static const float grid_size = 4.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = w1.bgr; + const float3 w3 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. + const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + + w2 * sample2 + w3 * sample3); +} + +float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 5-queens pattern: + // . Q . . . : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5 + // . . . . Q : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5 + // . . Q . . : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5 + // Q . . . . : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5 + // . . . Q . 
: off =(-2.0, -2.0)/5 + (3.0, 4.0)/5 + // Static screenspace sample offsets (compute some implicitly): + static const float grid_size = 5.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = w1.bgr; + const float3 w4 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + + w2 * sample2 + w3 * sample3 + w4 * sample4); +} + +float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 6-queens pattern with a stronger horizontal + // than vertical slant: + // . . . . Q . : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6 + // . . Q . . . : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6 + // Q . . . . . : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6 + // . . . . . Q : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6 + // . . . Q . . : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6 + // . Q . . . . 
: off =(-2.5, -2.5)/6 + (1.0, 5.0)/6 + // Static screenspace sample offsets (compute some implicitly): + static const float grid_size = 6.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = w2.bgr; + const float3 w4 = w1.bgr; + const float3 w5 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 + + w3 * sample3 + w4 * sample4 + w5 * sample5); +} + +float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 7-queens pattern with a queen in the center: + // . Q . . . . . : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7 + // . . . . Q . . : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7 + // Q . . . . . . : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7 + // . . . Q . . . : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7 + // . . . . . . Q : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7 + // . . Q . . . . : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7 + // . . . . . Q . 
: off =(-3.0, -3.0)/7 + (5.0, 6.0)/7 + static const float grid_size = 7.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = w2.bgr; + const float3 w5 = w1.bgr; + const float3 w6 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2; + const float3 w_sum = half_sum + half_sum.bgr + w3; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6); +} + +float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 8-queens pattern. + // . . Q . . . . . : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8 + // . . . . Q . . . : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8 + // . Q . . . . . . : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8 + // . . . . . . . Q : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8 + // Q . . . . . . . : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8 + // . . . . . . Q . : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8 + // . . . Q . . . . : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8 + // . . . . . Q . . 
: off =(-3.5, -3.5)/8 + (5.0, 7.0)/8 + static const float grid_size = 8.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = w3.bgr; + const float3 w5 = w2.bgr; + const float3 w6 = w1.bgr; + const float3 w7 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, and mirror on odd frames if directed: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7); +} + +float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 12-superqueens pattern where no 3 points are + // exactly collinear. + // . . . Q . . . . . . . . : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12 + // . . . . . . . . . Q . . : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12 + // . . . . . . Q . . . . . : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12 + // . Q . . . . . . . . . . : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12 + // . . . . . . . . . . . 
Q : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12 + // . . . . Q . . . . . . . : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12 + // . . . . . . . Q . . . . : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12 + // Q . . . . . . . . . . . : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12 + // . . . . . . . . . . Q . : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12 + // . . . . . Q . . . . . . : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12 + // . . Q . . . . . . . . . : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12 + // . . . . . . . . Q . . . : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12 + static const float grid_size = 12.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(3.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(6.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(11.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(4.0, 5.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = w5.bgr; + const float3 w7 = w4.bgr; + const float3 w8 = w3.bgr; + const float3 w9 = w2.bgr; + const float3 w10 = w1.bgr; + const float3 w11 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/w_sum; + // Scale the pixel-space to texture offset matrix by the pixel diameter. + const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv 
+ uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11); +} + +float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 16-superqueens pattern where no 3 points are + // exactly collinear. + // . . Q . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16 + // . . . . . . . . . Q . . . . . . : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16 + // . . . . . . . . . . . . Q . . . : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16 + // . . . . Q . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16 + // . . . . . . . . Q . . . . . . . : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16 + // . . . . . . . . . . . . . . Q . : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16 + // Q . . . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16 + // . . . . . . . . . . Q . . . . . : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16 + // . . . . . Q . . . . . . . . . . : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16 + // . . . . . . . . . . . . . . . Q : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16 + // . Q . . . . . . . . . . . . . . 
: off =(-7.5, -7.5)/16 + (1.0, 10.0)/16 + // . . . . . . . Q . . . . . . . . : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16 + // . . . . . . . . . . . Q . . . . : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16 + // . . . Q . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16 + // . . . . . . Q . . . . . . . . . : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16 + // . . . . . . . . . . . . . Q . . : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16 + static const float grid_size = 16.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(12.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(4.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(8.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(14.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(0.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(10.0, 7.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = w7.bgr; + const float3 w9 = w6.bgr; + const float3 w10 = w5.bgr; + const float3 w11 = w4.bgr; + const float3 w12 = w3.bgr; + const float3 w13 = w2.bgr; + const float3 w14 = w1.bgr; + const float3 w15 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample13 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); +} + +float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 20-superqueens pattern where no 3 points are + // exactly collinear and superqueens have a squared attack radius of 13. + // . . . . . . . Q . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20 + // . . . . . . . . . . . . . . . . Q . . . : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20 + // . . . . . . . . . . . Q . . . . . . . . : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20 + // . Q . . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20 + // . . . . . Q . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20 + // . . . . . . . . . . . . . . . Q . . . . : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20 + // . . . . . . . . . . Q . . . . . . . . . : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20 + // . . . . . . . . . . . . . . . . . . . Q : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20 + // . . Q . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20 + // . . . . . . Q . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20 + // . . . . . . . . . . . . . Q . . . . . . : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20 + // . . . . . . . . . . . . . . . . . Q . . : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20 + // Q . . . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20 + // . . . . . . . . . Q . . . . . . . . . . 
: off =(-9.5, -9.5)/20 + (9.0, 13.0)/20 + // . . . . Q . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20 + // . . . . . . . . . . . . . . Q . . . . . : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20 + // . . . . . . . . . . . . . . . . . . Q . : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20 + // . . . . . . . . Q . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20 + // . . . Q . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20 + // . . . . . . . . . . . . Q . . . . . . . : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20 + static const float grid_size = 20.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(7.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(11.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(10.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(19.0, 7.0) * xy_step; + const float2 xy_offset8 = xy_start_offset + float2(2.0, 8.0) * xy_step; + const float2 xy_offset9 = xy_start_offset + float2(6.0, 9.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const float3 w10 = w9.bgr; + const float3 w11 = w8.bgr; + const float3 w12 = w7.bgr; + const float3 w13 = w6.bgr; + const float3 w14 = w5.bgr; + const float3 w15 = w4.bgr; + const float3 w16 = w3.bgr; + const float3 w17 = w2.bgr; + const float3 w18 = w1.bgr; + const float3 w19 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; + const float3 sample11 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 + + w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19); +} + +float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 24-superqueens pattern where no 3 points are + // exactly collinear and superqueens have a squared attack radius of 13. + // . . . . . . Q . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24 + // . . . . . . . . . . . . . . . . Q . . . . . . . : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24 + // . . . . . . . . . . Q . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24 + // . . . . . . . . . . . . . . . . . . . . . Q . . : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24 + // . . . . . Q . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24 + // . . . . . . . . . . . . . . . Q . . . . . . . . : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24 + // . Q . . . . . . . . . . 
. . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24 + // . . . . . . . . . . . Q . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24 + // . . . . . . . . . . . . . . . . . . . Q . . . . : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24 + // . . . . . . . . . . . . . . . . . . . . . . . Q : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24 + // . . . Q . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24 + // . . . . . . . . . . . . . . Q . . . . . . . . . : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24 + // . . . . . . . . . Q . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24 + // . . . . . . . . . . . . . . . . . . . . Q . . . : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24 + // Q . . . . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24 + // . . . . Q . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24 + // . . . . . . . . . . . . Q . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24 + // . . . . . . . . . . . . . . . . . . . . . . Q . : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24 + // . . . . . . . . Q . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24 + // . . . . . . . . . . . . . . . . . . Q . . . . . : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24 + // . . Q . . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24 + // . . . . . . . . . . . . . Q . . . . . . . . . . : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24 + // . . . . . . . Q . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24 + // . . . . . . . . . . . . . . . . . Q . . . . . . 
: off =(-11.5, -11.5)/24 + (17.0, 23.0)/24 + static const float grid_size = 24.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const float2 xy_offset0 = xy_start_offset + float2(6.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(10.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(21.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(1.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(11.0, 7.0) * xy_step; + const float2 xy_offset8 = xy_start_offset + float2(19.0, 8.0) * xy_step; + const float2 xy_offset9 = xy_start_offset + float2(23.0, 9.0) * xy_step; + const float2 xy_offset10 = xy_start_offset + float2(3.0, 10.0) * xy_step; + const float2 xy_offset11 = xy_start_offset + float2(14.0, 11.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const float3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance); + const float3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance); + const float3 w12 = w11.bgr; + const float3 w13 = w10.bgr; + const float3 w14 = w9.bgr; + const float3 w15 = w8.bgr; + const float3 w16 = w7.bgr; + const float3 w17 = w6.bgr; + const float3 w18 = w5.bgr; + const float3 w19 = w4.bgr; + const float3 w20 = w3.bgr; + const float3 w21 = w2.bgr; + const float3 w22 = w1.bgr; + const float3 w23 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + + w5 + w6 + w7 + w8 + w9 + w10 + w11; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + const float2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign); + const float2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; + const float3 sample9 = 
tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; + const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 + + w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19 + + w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23); +} + +float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // Sample on a regular 4x4 grid. This is mainly for testing. 
+ static const float grid_size = 4.0; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample: + const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + // (We can't exploit vertical or horizontal symmetry due to uncertain + // subpixel offsets. We could fix that by rotating xy offsets with the + // subpixel structure, but...no.) 
+ const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = w7.bgr; + const float3 w9 = w6.bgr; + const float3 w10 = w5.bgr; + const float3 w11 = w4.bgr; + const float3 w12 = w3.bgr; + const float3 w13 = w2.bgr; + const float3 w14 = w1.bgr; + const float3 w15 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + // Get uv sample offsets, taking advantage of row alignment: + const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0)); + const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y)); + const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y); + const float2 sample0_uv = tex_uv + uv_offset0; + const float2 sample4_uv = sample0_uv + uv_step_y; + const float2 sample8_uv = sample0_uv + uv_step_y * 2.0; + const float2 sample12_uv = sample0_uv + uv_step_y * 3.0; + // Load samples, linearizing if necessary, etc.: + const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb; + // Sum weighted 
samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); +} + +float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ + // This function is for testing only: Use an NxN grid with dynamic weights. + static const int grid_size = 8; + assign_aa_cubic_constants(); + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0; + const float2 filter_space_offset_step = + subpixel_support_diameter/float2(grid_size); + const float2 sample0_filter_space_offset = + -grid_radius_in_samples * filter_space_offset_step; + // Compute xy sample offsets and subpixel weights: + float3 weights[64]; //originally grid_size * grid_size + float3 weight_sum = float3(0.0, 0.0, 0.0); + for(int i = 0; i < grid_size; ++i) + { + for(int j = 0; j < grid_size; ++j) + { + // Weights based on xy distances: + const float2 offset = sample0_filter_space_offset + + float2(j, i) * filter_space_offset_step; + const float3 weight = eval_unorm_rgb_weights(offset, final_axis_importance); + weights[i*grid_size + j] = weight; + weight_sum += weight; + } + } + // Get uv offset vectors along x and y directions: + const float2x2 true_pixel_to_tex_uv = + float2x2(pixel_to_tex_uv * aa_pixel_diameter); + const float2 uv_offset_step_x = mul(true_pixel_to_tex_uv, + float2(filter_space_offset_step.x, 0.0)); + const float2 uv_offset_step_y = mul(true_pixel_to_tex_uv, + float2(0.0, filter_space_offset_step.y)); + // Get a starting sample location: + const float2 sample0_uv_offset = 
-grid_radius_in_samples * + (uv_offset_step_x + uv_offset_step_y); + const float2 sample0_uv = tex_uv + sample0_uv_offset; + // Load, weight, and sum [linearized] samples: + float3 sum = float3(0.0, 0.0, 0.0); + const float3 weight_sum_inv = float3(1.0)/weight_sum; + for(int i = 0; i < grid_size; ++i) + { + const float2 row_i_first_sample_uv = + sample0_uv + i * uv_offset_step_y; + for(int j = 0; j < grid_size; ++j) + { + const float2 sample_uv = + row_i_first_sample_uv + j * uv_offset_step_x; + sum += weights[i*grid_size + j] * + tex2Daa_tiled_linearize(tex, sample_uv).rgb; + } + } + return sum * weight_sum_inv; +} + + +/////////////////////// ANTIALIASING CODEPATH SELECTION ////////////////////// + +inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) +{ +#define DEBUG +#ifdef DEBUG + return tex2Daa_subpixel_weights_only( + tex, tex_uv, pixel_to_tex_uv); +#else + // Statically switch between antialiasing modes/levels (NOTE(review): this #else branch is never compiled while DEBUG is #defined just above, so aa_level is ignored -- confirm that is intended): + return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb : + (aa_level < 3.5) ? tex2Daa_subpixel_weights_only( + tex, tex_uv, pixel_to_tex_uv) : + (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 254.5) ?
tex2Daa_debug_16x_regular( + tex, tex_uv, pixel_to_tex_uv, frame) : + tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame); +#endif +} + + +#endif // TEX2DANTIALIAS_H + +///////////////////////// END TEX2DANTIALIAS ///////////////////////// + +//#include "geometry-functions.h" + +///////////////////////// BEGIN GEOMETRY-FUNCTIONS ///////////////////////// + +#ifndef GEOMETRY_FUNCTIONS_H +#define GEOMETRY_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// already included elsewhere +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" +//#include "bind-shader-h" + + +//////////////////////////// MACROS AND CONSTANTS //////////////////////////// + +// Curvature-related constants: +#define MAX_POINT_CLOUD_SIZE 9 + + +///////////////////////////// CURVATURE FUNCTIONS ///////////////////////////// + +float2 quadratic_solve(const float a, const float b_over_2, const float c) +{ + // Requires: 1.) a, b, and c are quadratic formula coefficients + // 2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out) + // 3.) 
b_over_2 must be guaranteed < 0.0 (avoids a branch) + // Returns: float2(first_solution, discriminant), so the caller + // can choose how to handle the "no intersection" case. The + // Kahan or Citardauq formula is used for numerical robustness. + const float discriminant = b_over_2*b_over_2 - a*c; + const float solution0 = c/(-b_over_2 + sqrt(discriminant)); + return float2(solution0, discriminant); +} + +float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec) +{ + // Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the sphere's + // local coordinate frame (eye_pos_vec is a position, i.e. + // a vector from the origin to the eye/camera) + // 2.) geom_radius is a global containing the sphere's radius + // Returns: Cast a ray of direction view_vec from eye_pos_vec at a + // sphere of radius geom_radius, and return the distance to + // the first intersection in units of length(view_vec). + // http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection + // Quadratic formula coefficients (b_over_2 is guaranteed negative): + const float a = dot(view_vec, view_vec); + const float b_over_2 = dot(view_vec, eye_pos_vec); // * 2.0 factored out + const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius; + return quadratic_solve(a, b_over_2, c); +} + +float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec) +{ + // Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the cylinder's + // local coordinate frame (eye_pos_vec is a position, i.e. + // a vector from the origin to the eye/camera) + // 2.) geom_radius is a global containing the cylinder's radius + // Returns: Cast a ray of direction view_vec from eye_pos_vec at a + // cylinder of radius geom_radius, and return the distance to + // the first intersection in units of length(view_vec). The + // derivation of the coefficients is in Christer Ericson's + // Real-Time Collision Detection, p.
195-196, and this version + // uses Lagrange's identity to reduce operations. + // Arbitrary "cylinder top" reference point for an infinite cylinder: + const float3 cylinder_top_vec = float3(0.0, geom_radius, 0.0); + const float3 cylinder_axis_vec = float3(0.0, 1.0, 0.0);//float3(0.0, 2.0*geom_radius, 0.0); + const float3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec; + const float3 axis_x_view = cross(cylinder_axis_vec, view_vec); + const float3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec); + // Quadratic formula coefficients (b_over_2 is guaranteed negative): + const float a = dot(axis_x_view, axis_x_view); + const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view); + const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) - + geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec); + return quadratic_solve(a, b_over_2, c); +} + +float2 cylinder_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) +{ + // Requires: An xyz intersection position on a cylinder. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: Define square_uv.x to be the signed arc length in xz-space, + // and define square_uv.y = -intersection_pos_local.y (+v = -y). + // Start with a numerically robust arc length calculation. + const float angle_from_image_center = atan2(intersection_pos_local.x, + intersection_pos_local.z); + const float signed_arc_len = angle_from_image_center * geom_radius; + // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide + // by the aspect ratio to stretch the mapping appropriately: + const float2 square_uv = float2(signed_arc_len, -intersection_pos_local.y); + const float2 video_uv = square_uv / geom_aspect; + return video_uv; +} + +float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a cylinder.
This is the + // inverse of cylinder_xyz_to_uv(). + // Expand video_uv by the aspect ratio to get proportionate x/y lengths, + // then calculate an xyz position for the cylindrical mapping above. + const float2 square_uv = video_uv * geom_aspect; + const float arc_len = square_uv.x; + const float angle_from_image_center = arc_len / geom_radius; + const float x_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = cos(angle_from_image_center) * geom_radius; + // Or: z = sqrt(geom_radius**2 - x**2) + // Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle) + const float3 intersection_pos_local = float3(x_pos, -square_uv.y, z_pos); + return intersection_pos_local; +} + +float2 sphere_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) +{ + // Requires: An xyz intersection position on a sphere. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: First define square_uv.x/square_uv.y == + // intersection_pos_local.x/intersection_pos_local.y. Then, + // length(square_uv) is the arc length from the image center + // at (0.0, 0.0, geom_radius) along the tangent great circle. + // Credit for this mapping goes to cgwg: I never managed to + // understand his code, but he told me his mapping was based on + // great circle distances when I asked him about it, which + // informed this very similar (almost identical) mapping. 
+ // Start with a numerically robust arc length calculation between the ray- + // sphere intersection point and the image center using a method posted by + // Roger Stafford on comp.soft-sys.matlab: + // https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ + const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius); + const float cp_len = + length(cross(intersection_pos_local, image_center_pos_local)); + const float dp = dot(intersection_pos_local, image_center_pos_local); + const float angle_from_image_center = atan2(cp_len, dp); + const float arc_len = angle_from_image_center * geom_radius; + // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide + // by the aspect ratio to stretch the mapping appropriately: + const float2 square_uv_unit = normalize(float2(intersection_pos_local.x, + -intersection_pos_local.y)); + const float2 square_uv = arc_len * square_uv_unit; + const float2 video_uv = square_uv / geom_aspect; + return video_uv; +} + +float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a sphere. This is the + // inverse of sphere_xyz_to_uv(). + // Expand video_uv by the aspect ratio to get proportionate x/y lengths, + // then calculate an xyz position for the spherical mapping above. + const float2 square_uv = video_uv * geom_aspect; + // Using length or sqrt here butchers the framerate on my 8800GTS if + // this function is called too many times, and so does taking the max + // component of square_uv/square_uv_unit (program length threshold?). 
+ //float arc_len = length(square_uv); + const float2 square_uv_unit = normalize(square_uv); + const float arc_len = square_uv.y/square_uv_unit.y; + const float angle_from_image_center = arc_len / geom_radius; + const float xy_dist_from_sphere_center = + sin(angle_from_image_center) * geom_radius; + //float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len)); + const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit; + const float z_pos = cos(angle_from_image_center) * geom_radius; + const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos); + return intersection_pos_local; +} + +float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) +{ + // Requires: An xyz intersection position on a sphere. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: Define square_uv.x to be the signed arc length in xz-space, + // and define square_uv.y == signed arc length in yz-space. + // See cylinder_xyz_to_uv() for implementation details (very similar). + const float2 angle_from_image_center = atan2( + float2(intersection_pos_local.x, -intersection_pos_local.y), + intersection_pos_local.zz); + const float2 signed_arc_len = angle_from_image_center * geom_radius; + const float2 video_uv = signed_arc_len / geom_aspect; + return video_uv; +} + +float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a sphere. This is the + // inverse of sphere_alt_xyz_to_uv(). + // See cylinder_uv_to_xyz() for implementation details (very similar).
+ const float2 square_uv = video_uv * geom_aspect; + const float2 arc_len = square_uv; + const float2 angle_from_image_center = arc_len / geom_radius; + const float2 xy_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos)); + return float3(xy_pos.x, -xy_pos.y, z_pos); +} + +inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local, + const float geom_mode) +{ + return geom_mode < 2.5 ? intersect_sphere(view_vec_local, eye_pos_local) : + intersect_cylinder(view_vec_local, eye_pos_local); +} + +inline float2 xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect, const float geom_mode) +{ + return geom_mode < 1.5 ? + sphere_xyz_to_uv(intersection_pos_local, geom_aspect) : + geom_mode < 2.5 ? + sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) : + cylinder_xyz_to_uv(intersection_pos_local, geom_aspect); +} + +inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect, + const float geom_mode) +{ + return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) : + geom_mode < 2.5 ? sphere_alt_uv_to_xyz(uv, geom_aspect) : + cylinder_uv_to_xyz(uv, geom_aspect); +} + +float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local, + const float2 geom_aspect, const float geom_mode, out float3 intersection_pos) +{ + // Get the intersection point on the primitive, given an eye position + // and view vector already in its local coordinate frame: + const float2 intersect_dist_and_discriminant = intersect(view_vec_local, + eye_pos_local, geom_mode); + const float3 intersection_pos_local = eye_pos_local + + view_vec_local * intersect_dist_and_discriminant.x; + // Save the intersection position to an output parameter: + intersection_pos = intersection_pos_local; + // Transform into uv coords, but give out-of-range coords if the + // view ray doesn't intersect the primitive in the first place: + return intersect_dist_and_discriminant.y > 0.005 ? 
+ xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : float2(1.0); +} + +float3 get_ideal_global_eye_pos_for_points(float3 eye_pos, + const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE], + const int num_points) +{ + // Requires: Parameters: + // 1.) Starting eye_pos is a global 3D position at which the + // camera contains all points in global_coords[] in its FOV + // 2.) geom_aspect = get_aspect_vector( + // output_size.x / output_size.y); + // 3.) global_coords is a point cloud containing global xyz + // coords of extreme points on the simulated CRT screen. + // Globals: + // 1.) geom_view_dist must be > 0.0. It controls the "near + // plane" used to interpret flat_video_uv as a view + // vector, which controls the field of view (FOV). + // Eyespace coordinate frame: +x = right, +y = up, +z = back + // Returns: Return an eye position at which the point cloud spans as + // much of the screen as possible (given the FOV controlled by + // geom_view_dist) without being cropped or sheared. + // Algorithm: + // 1.) Move the eye laterally to a point which attempts to maximize the + // amount we can move forward without clipping the CRT screen. + // 2.) Move forward by as much as possible without clipping the CRT.
+ // Get the allowed movement range by solving for the eye_pos offsets + // that result in each point being projected to a screen edge/corner in + // pseudo-normalized device coords (where xy ranges from [-0.5, 0.5] + // and z = eyespace z): + // pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)* + // geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z); + // Notes: + // The field of view is controlled by geom_view_dist's magnitude relative to + // the view vector's x and y components: + // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect + // view_vec.z = -geom_view_dist + // But for the purposes of perspective divide, it should be considered: + // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist + // view_vec.z = -1.0 + static const int max_centering_iters = 1; // Keep for easy testing. + for(int iter = 0; iter < max_centering_iters; iter++) + { + // 0.) Get the eyespace coordinates of our point cloud: + float3 eyespace_coords[MAX_POINT_CLOUD_SIZE]; + for(int i = 0; i < num_points; i++) + { + eyespace_coords[i] = global_coords[i] - eye_pos; + } + // 1a.)For each point, find out how far we can move eye_pos in each + // lateral direction without the point clipping the frustum. + // Eyespace +y = up, screenspace +y = down, so flip y after + // applying the eyespace offset (on the way to "clip space"). + // Solve for two offsets per point based on: + // (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5) + // (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5) + // offset_ul and offset_dr represent the farthest we can move the + // eye_pos up-left and down-right. Save the min of all offset_dr's + // and the max of all offset_ul's (since it's negative). + float abs_radius = abs(geom_radius); // In case anyone gets ideas. 
;) + float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius); + float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius); + for(int i = 0; i < num_points; i++) + { + static const float2 flipy = float2(1.0, -1.0); + float3 eyespace_xyz = eyespace_coords[i]; + float2 offset_dr = eyespace_xyz.xy - float2(-0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + float2 offset_ul = eyespace_xyz.xy - float2(0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + offset_dr_min = min(offset_dr_min, offset_dr); + offset_ul_max = max(offset_ul_max, offset_ul); + } + // 1b.)Update eye_pos: Adding the average of offset_ul_max and + // offset_dr_min gives it equal leeway on the top vs. bottom + // and left vs. right. Recalculate eyespace_coords accordingly. + float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min); + eye_pos.xy += center_offset; + for(int i = 0; i < num_points; i++) + { + eyespace_coords[i] = global_coords[i] - eye_pos; + } + // 2a.)For each point, find out how far we can move eye_pos forward + // without the point clipping the frustum. Flip the y + // direction in advance (matters for a later step, not here). + // Solve for four offsets per point based on: + // eyespace_xyz_flipy.x * geom_view_dist / + // (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5 + // eyespace_xyz_flipy.y * geom_view_dist / + // (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5 + // eyespace_xyz_flipy.x * geom_view_dist / + // (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5 + // eyespace_xyz_flipy.y * geom_view_dist / + // (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5 + // We'll vectorize the actual computation. Take the maximum of + // these four for a single offset, and continue taking the max + // for every point (use max because offset.z is negative). 
+ float offset_z_max = -10.0 * geom_radius * geom_view_dist; + for(int i = 0; i < num_points; i++) + { + float3 eyespace_xyz_flipy = eyespace_coords[i] * + float3(1.0, -1.0, 1.0); + float4 offset_zzzz = eyespace_xyz_flipy.zzzz + + (eyespace_xyz_flipy.xyxy * geom_view_dist) / + (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect)); + // Ignore offsets that push positive x/y values to opposite + // boundaries, and vice versa, and don't let the camera move + // past a point in the dead center of the screen: + offset_z_max = (eyespace_xyz_flipy.x < 0.0) ? + max(offset_z_max, offset_zzzz.x) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.y < 0.0) ? + max(offset_z_max, offset_zzzz.y) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.x > 0.0) ? + max(offset_z_max, offset_zzzz.z) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.y > 0.0) ? + max(offset_z_max, offset_zzzz.w) : offset_z_max; + offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z); + } + // 2b.)Update eye_pos: Add the maximum (smallest negative) z offset. + eye_pos.z += offset_z_max; + } + return eye_pos; +} + +float3 get_ideal_global_eye_pos(const float3x3 local_to_global, + const float2 geom_aspect, const float geom_mode) +{ + // Start with an initial eye_pos that includes the entire primitive + // (sphere or cylinder) in its field-of-view: + const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist); + const float3 low_view = high_view * float3(1.0, -1.0, 1.0); + const float len_sq = dot(high_view, high_view); + const float fov = abs(acos(dot(high_view, low_view)/len_sq)); + // Trigonometry/similar triangles say distance = geom_radius/sin(fov/2): + const float eye_z_spherical = geom_radius/sin(fov*0.5); + const float3 eye_pos = geom_mode < 2.5 ? + float3(0.0, 0.0, eye_z_spherical) : + float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical)); + + // Get global xyz coords of extreme sample points on the simulated CRT + // screen. 
Start with the center, edge centers, and corners of the + // video image. We can't ignore backfacing points: They're occluded + // by closer points on the primitive, but they may NOT be occluded by + // the convex hull of the remaining samples (i.e. the remaining convex + // hull might not envelop points that do occlude a back-facing point.) + static const int num_points = MAX_POINT_CLOUD_SIZE; + float3 global_coords[MAX_POINT_CLOUD_SIZE]; + global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode)); + global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode)); + global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode)); + global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode)); + global_coords[4] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode)); + global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode)); + global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode)); + global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode)); + global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode)); + // Adding more inner image points could help in extreme cases, but too many + // points will kill the framerate. For safety, default to the initial + // eye_pos if any z coords are negative: + float num_negative_z_coords = 0.0; + for(int i = 0; i < num_points; i++) + { + num_negative_z_coords += float(global_coords[i].z < 0.0); + } + // Outsource the optimized eye_pos calculation: + return num_negative_z_coords > 0.5 ?
eye_pos : + get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect, + global_coords, num_points); +} + +float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local, + const float3 eye_pos_local, const float3 view_vec_global, + const float3 intersection_pos_local, const float3 normal, + const float2 output_size_inv) +{ + // Requires: See get_curved_video_uv_coords_and_tangent_matrix for + // descriptions of each parameter. + // Returns: Return a transformation matrix from 2D pixel-space vectors + // (where (+1.0, +1.0) is a vector to one pixel down-right, + // i.e. same directionality as uv texels) to 3D object-space + // vectors in the CRT's local coordinate frame (right-handed) + // ***which are tangent to the CRT surface at the intersection + // position.*** (Basically, we want to convert pixel-space + // vectors to 3D vectors along the CRT's surface, for later + // conversion to uv vectors.) + // Shorthand inputs: + const float3 pos = intersection_pos_local; + const float3 eye_pos = eye_pos_local; + // Get a piecewise-linear matrix transforming from "pixelspace" offset + // vectors (1.0 = one pixel) to object space vectors in the tangent + // plane (faster than finding 3 view-object intersections). + // 1.) Get the local view vecs for the pixels to the right and down: + const float3 view_vec_right_global = view_vec_global + + float3(output_size_inv.x, 0.0, 0.0); + const float3 view_vec_down_global = view_vec_global + + float3(0.0, -output_size_inv.y, 0.0); + const float3 view_vec_right_local = + mul(global_to_local, view_vec_right_global); + const float3 view_vec_down_local = + mul(global_to_local, view_vec_down_global); + // 2.) 
Using the true intersection point, intersect the neighboring + // view vectors with the tangent plane: + const float3 intersection_vec_dot_normal = float3(dot(pos - eye_pos, normal), dot(pos - eye_pos, normal), dot(pos - eye_pos, normal)); + const float3 right_pos = eye_pos + (intersection_vec_dot_normal / + dot(view_vec_right_local, normal))*view_vec_right_local; + const float3 down_pos = eye_pos + (intersection_vec_dot_normal / + dot(view_vec_down_local, normal))*view_vec_down_local; + // 3.) Subtract the original intersection pos from its neighbors; the + // resulting vectors are object-space vectors tangent to the plane. + // These vectors are the object-space transformations of (1.0, 0.0) + // and (0.0, 1.0) pixel offsets, so they form the first two basis + // vectors of a pixelspace to object space transformation. This + // transformation is 2D to 3D, so use (0, 0, 0) for the third vector. + const float3 object_right_vec = right_pos - pos; + const float3 object_down_vec = down_pos - pos; + const float3x3 pixel_to_object = float3x3( + object_right_vec.x, object_down_vec.x, 0.0, + object_right_vec.y, object_down_vec.y, 0.0, + object_right_vec.z, object_down_vec.z, 0.0); + return pixel_to_object; +} + +float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local, + const float3 normal, const float2 geom_aspect, const float geom_mode) +{ + // Requires: See get_curved_video_uv_coords_and_tangent_matrix for + // descriptions of each parameter. + // Returns: Return a transformation matrix from 3D object-space vectors + // in the CRT's local coordinate frame (right-handed, +y = up) + // to 2D video_uv vectors (+v = down). + // Description: + // The TBN matrix formed by the [tangent, bitangent, normal] basis + // vectors transforms ordinary vectors from tangent->object space. + // The cotangent matrix formed by the [cotangent, cobitangent, normal] + // basis vectors transforms normal vectors (covectors) from + // tangent->object space. 
It's the inverse-transpose of the TBN matrix. + // We want the inverse of the TBN matrix (transpose of the cotangent + // matrix), which transforms ordinary vectors from object->tangent space. + // Start by calculating the relevant basis vectors in accordance with + // Christian Schüler's blog post "Followup: Normal Mapping Without + // Precomputed Tangents": http://www.thetenthplanet.de/archives/1180 + // With our particular uv mapping, the scale of the u and v directions + // is determined entirely by the aspect ratio for cylindrical and ordinary + // spherical mappings, and so tangent and bitangent lengths are also + // determined by it (the alternate mapping is more complex). Therefore, we + // must ensure appropriate cotangent and cobitangent lengths as well. + // Base these off the uv<=>xyz mappings for each primitive. + const float3 pos = intersection_pos_local; + static const float3 x_vec = float3(1.0, 0.0, 0.0); + static const float3 y_vec = float3(0.0, 1.0, 0.0); + // The tangent and bitangent vectors correspond with increasing u and v, + // respectively. Mathematically we'd base the cotangent/cobitangent on + // those, but we'll compute the cotangent/cobitangent directly when we can. + float3 cotangent_unscaled, cobitangent_unscaled; + // geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE. 
+ if(geom_mode < 1.5) + { + // Sphere: + // tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x + // bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y + // inv_determinant = 1.0/length(cross(bitangent, tangent)) + // cotangent = cross(normal, bitangent) * inv_determinant + // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant + // cobitangent = cross(tangent, normal) * inv_determinant + // == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant + // Simplified (scale by inv_determinant below): + cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y; + cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x; + } + else if(geom_mode < 2.5) + { + // Sphere, alternate mapping: + // This mapping works a bit like the cylindrical mapping in two + // directions, which makes the lengths and directions more complex. + // Unfortunately, I can't find much of a shortcut: + const float3 tangent = normalize( + cross(y_vec, float3(pos.x, 0.0, pos.z))) * geom_aspect.x; + const float3 bitangent = normalize( + cross(x_vec, float3(0.0, pos.yz))) * geom_aspect.y; + cotangent_unscaled = cross(normal, bitangent); + cobitangent_unscaled = cross(tangent, normal); + } + else + { + // Cylinder: + // tangent = normalize(cross(y_vec, normal)) * geom_aspect.x; + // bitangent = float3(0.0, -geom_aspect.y, 0.0); + // inv_determinant = 1.0/length(cross(bitangent, tangent)) + // cotangent = cross(normal, bitangent) * inv_determinant + // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant + // cobitangent = cross(tangent, normal) * inv_determinant + // == float3(0.0, -geom_aspect.x, 0.0) * inv_determinant + cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y; + cobitangent_unscaled = float3(0.0, -geom_aspect.x, 0.0); + } + const float3 computed_normal = + cross(cobitangent_unscaled, cotangent_unscaled); + const float inv_determinant = rsqrt(dot(computed_normal, computed_normal)); + const 
float3 cotangent = cotangent_unscaled * inv_determinant; + const float3 cobitangent = cobitangent_unscaled * inv_determinant; + // The [cotangent, cobitangent, normal] column vecs form the cotangent + // frame, i.e. the inverse-transpose TBN matrix. Get its transpose: + const float3x3 object_to_tangent = float3x3(cotangent, cobitangent, normal); + return object_to_tangent; +} + +float2 get_curved_video_uv_coords_and_tangent_matrix( + const float2 flat_video_uv, const float3 eye_pos_local, + const float2 output_size_inv, const float2 geom_aspect, + const float geom_mode, const float3x3 global_to_local, + out float2x2 pixel_to_tangent_video_uv) +{ + // Requires: Parameters: + // 1.) flat_video_uv coords are in range [0.0, 1.0], where + // (0.0, 0.0) is the top-left corner of the screen and + // (1.0, 1.0) is the bottom-right corner. + // 2.) eye_pos_local is the 3D camera position in the simulated + // CRT's local coordinate frame. For best results, it must + // be computed based on the same geom_view_dist used here. + // 3.) output_size_inv = float2(1.0)/output_size + // 4.) geom_aspect = get_aspect_vector( + // output_size.x / output_size.y); + // 5.) geom_mode is a static or runtime mode setting: + // 0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder + // 6.) global_to_local is a 3x3 matrix transforming (ordinary) + // worldspace vectors to the CRT's local coordinate frame + // Globals: + // 1.) geom_view_dist must be > 0.0. It controls the "near + // plane" used to interpret flat_video_uv as a view + // vector, which controls the field of view (FOV). + // Returns: Return final uv coords in [0.0, 1.0], and return a pixel- + // space to video_uv tangent-space matrix in the out parameter. + // (This matrix assumes pixel-space +y = down, like +v = down.) 
+ // We'll transform flat_video_uv into a view vector, project + // the view vector from the camera/eye, intersect with a sphere + // or cylinder representing the simulated CRT, and convert the + // intersection position into final uv coords and a local + // transformation matrix. + // First get the 3D view vector (geom_aspect and geom_view_dist are globals): + // 1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5) + // correspond to the top-left/bottom-right output screen corners. + // 2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen- + // space 2D aspect correction. We'll reapply it in uv-space. + // 3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y + // is up in 3D worldspace (enforce a right-handed system). + // 4.) The view vector z controls the "near plane" distance and FOV. + // For the effect of "looking through a window" at a CRT, it should be + // set equal to the user's distance from their physical screen, in + // units of the viewport's physical diagonal size. + const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect; + const float3 view_vec_global = + float3(view_uv.x, -view_uv.y, -geom_view_dist); + // Transform the view vector into the CRT's local coordinate frame, convert + // to video_uv coords, and get the local 3D intersection position: + const float3 view_vec_local = mul(global_to_local, view_vec_global); + float3 pos; + const float2 centered_uv = view_vec_to_uv( + view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos); + const float2 video_uv = centered_uv + float2(0.5); + // Get a pixel-to-tangent-video-uv matrix. The caller could deal with + // all but one of these cases, but that would be more complicated. + #ifdef DRIVERS_ALLOW_DERIVATIVES + // Derivatives obtain a matrix very fast, but the direction of pixel- + // space +y seems to depend on the pass. Enforce the correct direction + // on a best-effort basis (but it shouldn't matter for antialiasing). 
+ const float2 duv_dx = ddx(video_uv); + const float2 duv_dy = ddy(video_uv); + #ifdef LAST_PASS + pixel_to_tangent_video_uv = float2x2( + duv_dx.x, duv_dy.x, + -duv_dx.y, -duv_dy.y); + #else + pixel_to_tangent_video_uv = float2x2( + duv_dx.x, duv_dy.x, + duv_dx.y, duv_dy.y); + #endif + #else + // Manually define a transformation matrix. We'll assume pixel-space + // +y = down, just like +v = down. + if(geom_force_correct_tangent_matrix) + { + // Get the surface normal based on the local intersection position: + const float3 normal_base = geom_mode < 2.5 ? pos : + float3(pos.x, 0.0, pos.z); + const float3 normal = normalize(normal_base); + // Get pixel-to-object and object-to-tangent matrices and combine + // them into a 2x2 pixel-to-tangent matrix for video_uv offsets: + const float3x3 pixel_to_object = get_pixel_to_object_matrix( + global_to_local, eye_pos_local, view_vec_global, pos, normal, + output_size_inv); + const float3x3 object_to_tangent = get_object_to_tangent_matrix( + pos, normal, geom_aspect, geom_mode); + const float3x3 pixel_to_tangent3x3 = + mul(object_to_tangent, pixel_to_object); + pixel_to_tangent_video_uv = float2x2( + pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);//._m00_m01_m10_m11); //TODO/FIXME: needs to correct for column-major?? + } + else + { + // Ignore curvature, and just consider flat scaling. The + // difference is only apparent with strong curvature: + pixel_to_tangent_video_uv = float2x2( + output_size_inv.x, 0.0, 0.0, output_size_inv.y); + } + #endif + return video_uv; +} + +float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect) +{ + // COPYRIGHT NOTE FOR THIS FUNCTION: + // Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey + // This function uses an algorithm first coded in several of cgwg's GPL- + // licensed lines in crt-geom-curved.cg and its ancestors. 
The line + // between algorithm and code is nearly indistinguishable here, so it's + // unclear whether I could even release this project under a non-GPL + // license with this function included. + + // Calculate border_dim_factor from the proximity to uv-space image + // borders; geom_aspect/border_size/border_darkness/border_compress are globals: + const float2 edge_dists = min(video_uv, float2(1.0) - video_uv) * + geom_aspect; + const float2 border_penetration = + max(float2(border_size) - edge_dists, float2(0.0)); + const float penetration_ratio = length(border_penetration)/border_size; + const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0); + const float border_dim_factor = + pow(border_escape_ratio, border_darkness) * max(1.0, border_compress); + return min(border_dim_factor, 1.0); +} + + + +#endif // GEOMETRY_FUNCTIONS_H + +///////////////////////// END GEOMETRY-FUNCTIONS ///////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +float2x2 mul_scale(float2 scale, float2x2 matrix) +{ + //float2x2 scale_matrix = float2x2(scale.x, 0.0, 0.0, scale.y); + //return mul(scale_matrix, matrix); + float4 intermed = float4(matrix[0][0],matrix[0][1],matrix[1][0],matrix[1][1]) * scale.xxyy; + return float2x2(intermed.x, intermed.y, intermed.z, intermed.w); +} + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + gl_Position = position; + vTexCoord = texCoord * 1.0001; + tex_uv = vTexCoord.xy; + video_and_texture_size_inv = + float4(1.0, 1.0, 1.0, 1.0) / float4(video_size, texture_size); + output_size_inv = float2(1.0, 1.0)/output_size; + + // Get aspect/overscan vectors from scalar parameters (likely uniforms): + const float viewport_aspect_ratio = output_size.x/output_size.y; + const float2 geom_aspect = get_aspect_vector(viewport_aspect_ratio); + const float2 geom_overscan = get_geom_overscan_vector(); + geom_aspect_and_overscan = float4(geom_aspect, geom_overscan); + + #ifdef
RUNTIME_GEOMETRY_TILT + // Create a local-to-global rotation matrix for the CRT's coordinate + // frame and its global-to-local inverse. Rotate around the x axis + // first (pitch) and then the y axis (yaw) with yucky Euler angles. + // Positive angles go clockwise around the right-vec and up-vec. + // Runtime shader parameters prevent us from computing these globally, + // but we can still combine the pitch/yaw matrices by hand to cut a + // few instructions. Note that cg matrices fill row1 first, then row2, + // etc. (row-major order). + const float2 geom_tilt_angle = get_geom_tilt_angle_vector(); + const float2 sin_tilt = sin(geom_tilt_angle); + const float2 cos_tilt = cos(geom_tilt_angle); + // Conceptual breakdown: + static const float3x3 rot_x_matrix = float3x3( + 1.0, 0.0, 0.0, + 0.0, cos_tilt.y, -sin_tilt.y, + 0.0, sin_tilt.y, cos_tilt.y); + static const float3x3 rot_y_matrix = float3x3( + cos_tilt.x, 0.0, sin_tilt.x, + 0.0, 1.0, 0.0, + -sin_tilt.x, 0.0, cos_tilt.x); + static const float3x3 local_to_global = + mul(rot_y_matrix, rot_x_matrix); +/* static const float3x3 global_to_local = + transpose(local_to_global); + const float3x3 local_to_global = float3x3( + cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, + 0.0, cos_tilt.y, sin_tilt.y, + sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); +*/ // This is a pure rotation, so transpose = inverse: + const float3x3 global_to_local = transpose(local_to_global); + // Decompose the matrix into 3 float3's for output: + global_to_local_row0 = float3(global_to_local[0][0], global_to_local[0][1], global_to_local[0][2]);//._m00_m01_m02); + global_to_local_row1 = float3(global_to_local[1][0], global_to_local[1][1], global_to_local[1][2]);//._m10_m11_m12); + global_to_local_row2 = float3(global_to_local[2][0], global_to_local[2][1], global_to_local[2][2]);//._m20_m21_m22); + #else + static const float3x3 global_to_local = geom_global_to_local_static; + static const float3x3 local_to_global = 
geom_local_to_global_static; + #endif + + // Get an optimal eye position based on geom_view_dist, viewport_aspect, + // and CRT radius/rotation: + #ifdef RUNTIME_GEOMETRY_MODE + const float geom_mode = geom_mode_runtime; + #else + static const float geom_mode = geom_mode_static; + #endif + const float3 eye_pos_global = + get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode); + eye_pos_local = mul(global_to_local, eye_pos_global); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/manifest.bml b/shaders/CRT-Royale.shader/manifest.bml new file mode 100644 index 00000000..778c0689 --- /dev/null +++ b/shaders/CRT-Royale.shader/manifest.bml @@ -0,0 +1,214 @@ +input + filter: nearest + +// IMPORTANT: +// Shader passes need to know details about the image in the mask_texture LUT +// files, so set the following constants in user-preset-constants.h accordingly: +// 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's) +// 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's) +// 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's) +// 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1]) +// 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1]) +// 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1]) +// Shader passes also need to know certain scales set in this preset, but their +// compilation model doesn't currently allow the preset file to tell them. Make +// sure to set the following constants in user-preset-constants.h accordingly too: +// 1.) bloom_approx_scale_x = scale_x2 +// 2.) mask_resize_viewport_scale = vec2(scale_x6, scale_y5) +// Finally, shader passes need to know the value of geom_max_aspect_ratio used to +// calculate scale_y5 (among other values): +// 1.) 
geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5) + +// Pass0: Linearize the input based on CRT gamma and bob interlaced fields. +// (Bobbing ensures we can immediately blur without getting artifacts.) +program + filter: nearest + vertex: first-pass-linearize-crt-gamma-bob-fields.vs + fragment: first-pass-linearize-crt-gamma-bob-fields.fs + format: rgba16f + height: 100% + width: 100% + +// Pass1: Resample interlaced (and misconverged) scanlines vertically. +// Separating vertical/horizontal scanline sampling is faster: It lets us +// consider more scanlines while calculating weights for fewer pixels, and +// it reduces our samples from vertical*horizontal to vertical+horizontal. +// This has to come right after ORIG_LINEARIZED, because there's no +// "original_source" scale_type we can use later. +program + filter: linear + vertex: scanlines-vertical-interlacing.vs + fragment: scanlines-vertical-interlacing.fs + height: 400% + width: 100% + format: rgba16f + +// Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and +// account for convergence offsets. We want to blur a predictable portion of the +// screen to match the phosphor bloom, and absolute scale works best for +// reliable results with a fixed-size bloom. Picking a scale is tricky: +// a.) 400x300 is a good compromise for the "fake-bloom" version: It's low enough +// to blur high-res/interlaced sources but high enough that resampling +// doesn't smear low-res sources too much. +// b.) 320x240 works well for the "real bloom" version: It's 1-1.5% faster, and +// the only noticeable visual difference is a larger halation spread (which +// may be a good thing for people who like to crank it up). +// Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's +// *intended* for an ~4:3 aspect ratio). 
+program + filter: linear + vertex: bloom-approx.vs + fragment: bloom-approx.fs + format: rgba16f + width: 320 px + height: 240 px + +// Pass3: Vertically blur the input for halation and refractive diffusion. +// Base this on BLOOM_APPROX: This blur should be small and fast, and blurring +// a constant portion of the screen is probably physically correct if the +// viewport resolution is proportional to the simulated CRT size. +program + filter: linear + vertex: blur9fast-vertical.vs + fragment: blur9fast-vertical.fs + format: rgba16f + height: 100% + width: 100% + +// Pass4: Horizontally blur the input for halation and refractive diffusion. +// Note: Using a one-pass 9x9 blur is about 1% slower. +program + filter: linear + vertex: blur9fast-horizontal.vs + fragment: blur9fast-horizontal.fs + format: rgba16f + height: 100% + width: 100% + +// Pass5: Lanczos-resize the phosphor mask vertically. Set the absolute +// scale_x5 == mask_texture_small_size.x (see IMPORTANT above). Larger scales +// will blur, and smaller scales could get nasty. The vertical size must be +// based on the viewport size and calculated carefully to avoid artifacts later. +// First calculate the minimum number of mask tiles we need to draw. +// Since curvature is computed after the scanline masking pass: +// num_resized_mask_tiles = 2.0; +// If curvature were computed in the scanline masking pass (it's not): +// max_mask_texel_border = ~3.0 * (1/3.0 + 4.0*sqrt(2.0) + 0.5 + 1.0); +// max_mask_tile_border = max_mask_texel_border/ +// (min_resized_phosphor_triad_size * mask_triads_per_tile); +// num_resized_mask_tiles = max(2.0, 1.0 + max_mask_tile_border * 2.0); +// At typical values (triad_size >= 2.0, mask_triads_per_tile == 8): +// num_resized_mask_tiles = ~3.8 +// Triad sizes are given in horizontal terms, so we need geom_max_aspect_ratio +// to relate them to vertical resolution. The widest we expect is: +// geom_max_aspect_ratio = 4.0/3.0 // Note: Shader passes need to know this! 
+// The fewer triads we tile across the screen, the larger each triad will be as a +// fraction of the viewport size, and the larger scale_y5 must be to draw a full +// num_resized_mask_tiles. Therefore, we must decide the smallest number of +// triads we'll guarantee can be displayed on screen. We'll set this according +// to 3-pixel triads at 768p resolution (the lowest anyone's likely to use): +// min_allowed_viewport_triads = 768.0*geom_max_aspect_ratio / 3.0 = 341.333333 +// Now calculate the viewport scale that ensures we can draw resized_mask_tiles: +// min_scale_x = resized_mask_tiles * mask_triads_per_tile / +// min_allowed_viewport_triads +// scale_y5 = geom_max_aspect_ratio * min_scale_x +// # Some code might depend on equal scales: +// scale_x6 = scale_y5 +// Given our default geom_max_aspect_ratio and min_allowed_viewport_triads: +// scale_y5 = 4.0/3.0 * 2.0/(341.33333 / 8.0) = 0.0625 +// IMPORTANT: The scales MUST be calculated in this way. If you wish to change +// geom_max_aspect_ratio, update that constant in user-preset-constants.h! +program + filter: linear + format: rgba16f + width: 64 px + height: 6.25% + vertex: mask-resize-vertical.vs + fragment: mask-resize-vertical.fs + pixmap: textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearShadowMaskEDPResizeTo64.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearShadowMaskEDP.png + filter: linear + wrap: repeat + +// Pass6: Lanczos-resize the phosphor mask horizontally. scale_x6 = scale_y5. 
+// TODO: Check again if the shaders actually require equal scales. +program + filter: nearest + vertex: mask-resize-horizontal.vs + fragment: mask-resize-horizontal.fs + format: rgba16f + +// Pass7: Resample (misconverged) scanlines horizontally, apply halation, and +// apply the phosphor mask. +program + filter: linear + format: rgba16f + height: 100% + width: 100% + vertex: scanlines-horizontal-apply-mask.vs + fragment: scanlines-horizontal-apply-mask.fs + pixmap: textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearShadowMaskEDPResizeTo64.png + filter: linear + wrap: repeat + pixmap: textures/TileableLinearShadowMaskEDP.png + filter: linear + wrap: repeat + +// Pass 8: Compute a brightpass. This will require reading the final mask. 
+program + filter: linear + format: rgba16f + vertex: brightpass.vs + fragment: brightpass.fs + +// Pass 9: Blur the brightpass vertically +program + filter: linear + format: rgba16f + vertex: bloom-vertical.vs + fragment: bloom-vertical.fs + +// Pass 10: Blur the brightpass horizontally and combine it with the dimpass: +program + filter: linear + format: rgba16f + height: 100% + width: 100% + vertex: bloom-horizontal-reconstitute.vs + fragment: bloom-horizontal-reconstitute.fs + +// Pass 11: Compute curvature/AA: +program + filter: linear + format: rgba16f + vertex: geometry-aa-last-pass.vs + fragment: geometry-aa-last-pass.fs + +output + filter: nearest \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/mask-resize-horizontal.fs b/shaders/CRT-Royale.shader/mask-resize-horizontal.fs new file mode 100644 index 00000000..8545d587 --- /dev/null +++ b/shaders/CRT-Royale.shader/mask-resize-horizontal.fs @@ -0,0 +1,3208 @@ +#version 150 + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; + +in Vertex { + vec2 vTexCoord; + vec2 src_tex_uv_wrap; + vec2 tile_uv_wrap; + vec2 resize_magnification_scale; + vec2 src_dxdy; + vec2 tile_size_uv; + vec2 input_tiles_per_texture; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define 
convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + 
+///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. 
Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now).
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. An earlier note here claimed the value was raised to 1.0 because 0.5 looked too dark, but the value actually set is 0.5. + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel?
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now).
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines.
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. (Porter's note: 0.5 seemed unnecessarily dark and was reportedly raised to 1.0, but the value set here is still 0.5.) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note called that unnecessarily dark and used 1.0, but 0.5 is the value in effect) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords.
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an earlier note here said this was raised to 1.0 because 0.5 looked too dark, but the value is 0.5 - confirm which is intended + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. Defining RADEON_FIX selects option 1 instead of the default option 2. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] (presumably a magnitude: the default x offset below is negative - confirm) + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x); + // Most antialiasing filters have a base radius of 4.0 pixels; pad it by the subpixel offset's magnitude (the static x offset may be negative, which would otherwise shrink the border): + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" + +///////////////////////////// CODEPATH SELECTION ///////////////////////////// + +// Choose a looping strategy based on what's allowed: +// Dynamic loops not allowed: Use a flat static loop. +// Dynamic loops accomodated: Coarsely branch around static loops. +// Dynamic loops assumed allowed: Use a flat dynamic loop. +#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. +// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. 
Round up to be safe: +static const float max_sinc_resize_samples_m4 = ceil( + max_sinc_resize_samples_float * 0.25) * 4.0; + + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) +{ + // Requires: The following global constants must be defined: + // 1.) mask_sinc_lobes + // 2.) max_sinc_resize_samples_m4 + // Returns: The minimum number of texture samples for a correct downsize + // at magnification_scale. + // We're downsizing, so the filter is sized across 2*lobes output pixels + // (not 2*lobes input texels). This impacts distance measurements and the + // minimum number of input samples needed. + const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale; + const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0; + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + const float max_samples_m4 = max_sinc_resize_samples_m4; + #else // ifdef BREAK_LOOPS_INTO_PIECES + // Simulating loops with branches imposes a 128-sample limit. + const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4); + #endif + return min(min_samples_m4, max_samples_m4); +} + +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, + const float input_tiles_per_texture_r, const float samples, + static const bool vertical) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) input_tiles_per_texture_r is the number of input tiles + // that can fit in the input texture in the direction we're + // resampling this pass. + // 3.) vertical indicates whether we're resampling vertically + // this pass (or horizontally). + // Returns: Pack and return the first sample's tile_uv coord in [0, 1] + // and its texel distance from the destination pixel, in the + // resized dimension only. 
+ // We'll start with the topmost or leftmost sample and work down or right, + // so get the first sample location and distance. Modify both dimensions + // as if we're doing a one-pass 2D resize; we'll throw away the unneeded + // (and incorrect) dimension at the end. + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. + const float2 first_texel_tile_uv_wrap_2D = + first_texel_uv_wrap_2D * input_tiles_per_texture_r; + // Project wrapped coordinates to the [0, 1] range. We'll do this with all + // samples, but the first texel is special, since it might be negative. + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; + // Pack the first texel's tile_uv coord and texel distance in 1D: + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + return vertical ? tile_v_and_dist : tile_u_and_dist; + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); +} + +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) +{ + // Mipmapping and anisotropic filtering get confused by sinc-resampling. 
+ // One [slow] workaround is to select the lowest mip level: + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + return textureLod(tex, tex_uv, 0.0); // GLSL textureLod needs an explicit LOD argument; 0.0 = base level + #else + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + return texture(tex, tex_uv, -16.0); // GLSL form of Cg tex2Dbias: third argument is the LOD bias + #else + return texture(tex, tex_uv); + #endif + #endif +} + + +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.) Accumulate the weight sum in weight_sum, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples. 
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. 
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonically opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture. 
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + // The input contains one mask tile horizontally and a number vertically. + // Resize the tile horizontally to its final screen size and repeat it + // until drawing at least mask_resize_num_tiles, leaving it unchanged + // vertically. 
Lanczos-resizing the phosphor mask achieves much sharper + // results than mipmapping, outputting >= mask_resize_num_tiles makes for + // easier tiled sampling later. + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + // Discard unneeded fragments in case our profile allows real branches. + //const float2 tile_uv_wrap = tile_uv_wrap; + if(get_mask_sample_mode() < 0.5 && + max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles) + { + const float src_dx = src_dxdy.x; + const float2 src_tex_uv = frac(src_tex_uv_wrap); + const float3 pixel_color = downsample_horizontal_sinc_tiled(input_texture, + src_tex_uv, texture_size, src_dxdy.x, + resize_magnification_scale.x, tile_size_uv.x); + // The input LUT was linear RGB, and so is our output: + FragColor = float4(pixel_color, 1.0); + } + else + { + discard; + } + #else + discard; + FragColor = float4(1.0,1.0,1.0,1.0); + #endif +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/mask-resize-horizontal.vs b/shaders/CRT-Royale.shader/mask-resize-horizontal.vs new file mode 100644 index 00000000..b64cf9c8 --- /dev/null +++ b/shaders/CRT-Royale.shader/mask-resize-horizontal.vs @@ -0,0 +1,3236 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 src_tex_uv_wrap; + vec2 tile_uv_wrap; + vec2 resize_magnification_scale; + vec2 src_dxdy; + vec2 tile_size_uv; + vec2 input_tiles_per_texture; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; +uniform int phase; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 
1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. 
Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). 
+ static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (an earlier note here said 0.5 looked too dark and 1.0 was used, but the value is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (the value used here); raise it toward 1.0 if the result looks unnecessarily dark + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an older remark here said the value was raised to 1.0 because 0.5 looked too dark, but the value in effect is 0.5 -- confirm which is intended. + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + // When RADEON_FIX is #defined, the 3x3 resize blur (1.0) is used instead of + // the true 4x4 Gaussian resize (2.0). +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; // range [0, 1] + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; // range [0, 2] + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)?
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + // NOTE(review): the default x offset below is -1/3, which lies outside the + // stated range unless the range refers to the magnitude -- confirm. + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space.
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. re-add horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area.
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +// (get_aspect_vector() clamps the aspect ratio it is given to this value.) +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +// Note: abs() discards the sign of c, so the result is always positive. +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +// After this cascade, at most one ANISOTROPIC_TILING_COMPAT_* strategy +// (TEX2DLOD, TEX2DBIAS, TILE_FLAT_TWICE, or FIX_DISCONTINUITIES) is #defined. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use.
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen.
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + // Use the magnitude of the red subpixel offset: the static default is + // negative (-1/3), and a signed value would shrink the border below the + // filter radius instead of widening it. + static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x); + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +// (the smallest allowed tile is mask_min_allowed_triad_size * +// mask_triads_per_tile pixels wide, which maximizes the border in tiles) +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here.
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here.
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +// Returns normalize(float2(min(geom_aspect_ratio, geom_max_aspect_ratio), 1.0)). +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +// Packs geom_overscan_x/y into a float2. +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +// Packs geom_tilt_angle_x/y into a float2. +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +// Packs the x convergence offsets of the R/G/B beams into a float3. +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +// Packs the y convergence offsets of the R/G/B beams into a float3. +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +// Packs the (x, y) convergence offset of the red beam into a float2. +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +// Packs the (x, y) convergence offset of the green beam into a float2. +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +// Packs the (x, y) convergence offset of the blue beam into a float2. +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +// Returns the runtime red subpixel offset only when RUNTIME_ANTIALIAS_WEIGHTS +// and RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS are both #defined; otherwise returns +// aa_subpixel_r_offset_static. +inline float2 get_aa_subpixel_r_offset()
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +// Returns the reciprocal of the average color of the mask selected by +// mask_type (< 0.5: aperture grille, < 1.5: slot mask, otherwise shadow mask). +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +// Returns mask_sample_mode_desired when RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +// is #defined and mask_sample_mode_static otherwise. Without +// PHOSPHOR_MASK_MANUALLY_RESIZE, the value is clamped to [1.0, 2.0] so that +// mode 0 (manual sinc resize) cannot be selected. +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version.
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" + +///////////////////////////// CODEPATH SELECTION ///////////////////////////// + +// Choose a looping strategy based on what's allowed: +// Dynamic loops not allowed: Use a flat static loop. +// Dynamic loops accomodated: Coarsely branch around static loops. +// Dynamic loops assumed allowed: Use a flat dynamic loop. +#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. +// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. 
Round up to be safe: +static const float max_sinc_resize_samples_m4 = ceil( + max_sinc_resize_samples_float * 0.25) * 4.0; + + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) +{ + // Requires: The following global constants must be defined: + // 1.) mask_sinc_lobes + // 2.) max_sinc_resize_samples_m4 + // Returns: The minimum number of texture samples for a correct downsize + // at magnification_scale. + // We're downsizing, so the filter is sized across 2*lobes output pixels + // (not 2*lobes input texels). This impacts distance measurements and the + // minimum number of input samples needed. + const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale; + const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0; + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + const float max_samples_m4 = max_sinc_resize_samples_m4; + #else // ifdef BREAK_LOOPS_INTO_PIECES + // Simulating loops with branches imposes a 128-sample limit. + const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4); + #endif + return min(min_samples_m4, max_samples_m4); +} + +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, + const float input_tiles_per_texture_r, const float samples, + static const bool vertical) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) input_tiles_per_texture_r is the number of input tiles + // that can fit in the input texture in the direction we're + // resampling this pass. + // 3.) vertical indicates whether we're resampling vertically + // this pass (or horizontally). + // Returns: Pack and return the first sample's tile_uv coord in [0, 1] + // and its texel distance from the destination pixel, in the + // resized dimension only. 
+ // We'll start with the topmost or leftmost sample and work down or right, + // so get the first sample location and distance. Modify both dimensions + // as if we're doing a one-pass 2D resize; we'll throw away the unneeded + // (and incorrect) dimension at the end. + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. + const float2 first_texel_tile_uv_wrap_2D = + first_texel_uv_wrap_2D * input_tiles_per_texture_r; + // Project wrapped coordinates to the [0, 1] range. We'll do this with all + // samples, but the first texel is special, since it might be negative. + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; + // Pack the first texel's tile_uv coord and texel distance in 1D: + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + return vertical ? tile_v_and_dist : tile_u_and_dist; + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); +} + +// Sample tex at tex_uv, steering the lookup to mip level 0 when one of the +// ANISOTROPIC_RESAMPLING_COMPAT_* workarounds is enabled; otherwise do a +// plain texture() lookup. +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) +{ + // Mipmapping and anisotropic filtering get confused by sinc-resampling.
+ // One [slow] workaround is to select the lowest mip level: + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // textureLod() takes the coordinate plus an explicit LOD argument; + // 0.0 selects the base mip level. + return textureLod(tex, tex_uv, 0.0); + #else + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + // NOTE(review): tex2Dbias is the Cg spelling; unless a compat macro + // defines it, the GLSL form is texture(tex, tex_uv, -16.0), which is + // only accepted in fragment shaders. + return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); + #else + return texture(tex, tex_uv); + #endif + #endif +} + + +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples.
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them.
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. 
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonally opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture.
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + // NOTE(review): fix_tiling_discontinuities_normalized() is commented out + // above, so the branch below only compiles while + // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES stays undefined. + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + gl_Position = position; + vTexCoord = texCoord * 1.0001; + float2 tex_uv = vTexCoord.xy; + // First estimate the viewport size (the user will get the wrong number of + // triads if it's wrong and mask_specify_num_triads is 1.0/true).
+ const float2 estimated_viewport_size = + output_size / mask_resize_viewport_scale; + // Find the final size of our resized phosphor mask tiles. We probably + // estimated the viewport size and MASK_RESIZE output size differently last + // pass, so do not swear they were the same. ;) + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + estimated_viewport_size, output_size, false); + + // We'll render resized tiles until filling the output FBO or meeting a + // limit, so compute [wrapped] tile uv coords based on the output uv coords + // and the number of tiles that will fit in the FBO. + const float2 output_tiles_this_pass = output_size / mask_resize_tile_size; + const float2 output_video_uv = tex_uv * texture_size / video_size; + const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass; + + // Get the texel size of an input tile and related values: + const float2 input_tile_size = float2(min( + mask_resize_src_lut_size.x, video_size.x), mask_resize_tile_size.y); + tile_size_uv = input_tile_size / texture_size; + input_tiles_per_texture = texture_size / input_tile_size; + + // Derive [wrapped] texture uv coords from [wrapped] tile uv coords and + // the tile size in uv coords, and save frac() for the fragment shader. 
+ src_tex_uv_wrap = tile_uv_wrap * tile_size_uv; + + // Output the values we need, including the magnification scale and step: + //tile_uv_wrap = tile_uv_wrap; + //src_tex_uv_wrap = src_tex_uv_wrap; + resize_magnification_scale = mask_resize_tile_size / input_tile_size; + src_dxdy = float2(1.0/texture_size.x, 0.0); + //tile_size_uv = tile_size_uv; + //input_tiles_per_texture = input_tiles_per_texture; +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/mask-resize-vertical.fs b/shaders/CRT-Royale.shader/mask-resize-vertical.fs new file mode 100644 index 00000000..16e8090e --- /dev/null +++ b/shaders/CRT-Royale.shader/mask-resize-vertical.fs @@ -0,0 +1,3248 @@ +#version 150 + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; +uniform sampler2D pixmap[]; +uniform int phase; + +in Vertex { + vec2 vTexCoord; + vec2 src_tex_uv_wrap; + vec2 resize_magnification_scale; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define 
aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms. NOTE(review): mul and atan2 below swap their two arguments in the expansion; presumably deliberate for mul, but confirm atan2 against its call sites +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define mask_grille_texture_small pixmap[0] +#define mask_slot_texture_small pixmap[2] +#define mask_shadow_texture_small pixmap[4] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT 
//////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + 
+#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. 
+// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? 
They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). 
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (an earlier note here mentioned raising it to 1.0 because 0.5 looked unnecessarily dark, but the value in effect is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said the value had been raised to 1.0 because 0.5 was unnecessarily dark, but the constant is 0.5 - confirm the intended value + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise it toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (raise it toward 1.0 if the result looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = max(abs(aa_subpixel_r_offset_static.x), abs(aa_subpixel_r_offset_static.y)); // magnitude: a negative offset must widen the border, not shrink it + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here.
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset()
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version.
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" + +///////////////////////////// CODEPATH SELECTION ///////////////////////////// + +// Choose a looping strategy based on what's allowed: +// Dynamic loops not allowed: Use a flat static loop. +// Dynamic loops accomodated: Coarsely branch around static loops. +// Dynamic loops assumed allowed: Use a flat dynamic loop. +#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. +// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. 
+// Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    // Requires: The following global constants must be defined:
+    //           1.) mask_sinc_lobes
+    //           2.) max_sinc_resize_samples_m4
+    // Returns:  The minimum number of texture samples for a correct downsize
+    //           at magnification_scale, rounded up to a multiple of 4 and
+    //           capped at the static maximum (and at 128 when dynamic
+    //           branches are not assumed).
+    // We're downsizing, so the filter is sized across 2*lobes output pixels
+    // (not 2*lobes input texels). This impacts distance measurements and the
+    // minimum number of input samples needed.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        // Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    // Requires: 1.) dr == du == 1.0/texture_size.x or
+    //               dr == dv == 1.0/texture_size.y
+    //               (whichever direction we're resampling in).
+    //               It's a scalar to save register space.
+    //           2.) input_tiles_per_texture_r is the number of input tiles
+    //               that can fit in the input texture in the direction we're
+    //               resampling this pass.
+    //           3.) samples is the total number of taps; the first tap is
+    //               placed (samples/2 - 1) texels before the texel center
+    //               preceding the output location.
+    //           4.) vertical indicates whether we're resampling vertically
+    //               this pass (or horizontally).
+    // Returns:  Pack and return the first sample's tile_uv coord in [0, 1]
+    //           and its texel distance from the destination pixel, in the
+    //           resized dimension only.
+    // We'll start with the topmost or leftmost sample and work down or right,
+    // so get the first sample location and distance. Modify both dimensions
+    // as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    // (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    // Project wrapped coordinates to the [0, 1] range. We'll do this with all
+    // samples, but the first texel is special, since it might be negative.
+    // NOTE(review): if frac() maps to GLSL fract() here (as the compatibility
+    // macros of the sibling .vs file do), it already yields [0, 1) for
+    // negative input, so adding coord_negative gives [1, 2) in that case.
+    // The loop body macros re-apply frac() to every sample coord, so this
+    // looks harmless -- confirm before relying on the [0, 1] claim.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    // Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    // Returns:  The texel of tex at tex_uv, fetched the way the
+    //           ANISOTROPIC_RESAMPLING_COMPAT_* settings select: explicit
+    //           LOD 0, a -16 LOD bias, or a plain texture() lookup.
+    // Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    // One [slow] workaround is to select the lowest mip level:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        // GLSL's textureLod() takes the LOD as its own third argument (Cg's
+        // tex2Dlod() packed it into coord.w), so pass it explicitly:
+        return textureLod(tex, tex_uv, 0.0);
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            // GLSL has no tex2Dbias(); texture() accepts the bias as an
+            // optional third argument (available in fragment shaders only):
+            return texture(tex, tex_uv, -16.0);
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+////////////////////////////// LOOP BODY MACROS //////////////////////////////
+
+// Using inline functions can exceed the temporary register limit, so we're
+// stuck with #define macros (I'm TRULY sorry). They're declared here instead
+// of above to be closer to the actual invocation sites. Steps:
+// 1.) Get the exact texel location.
+// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+// 3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//     We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//     in pixel_color (we'll normalize outside the loop at the end).
+// We vectorize the loop to help reduce the Lanczos window's cost.
+
+    // The r coord is the coord in the dimension we're resizing along (u or v),
+    // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
+    // for four new texel samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac( \
+            first_texel_tile_uv_rrrr + true_i * tile_dr); \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    // NOTE(review): both weight formulas divide by pi_dist, which is 0 when a
+    // sample lands exactly on the output location. The min(..., 1.0) wrapper
+    // presumably exists to turn that 0/0 into a weight of 1.0 -- confirm that
+    // the target drivers' min() behaves that way for NaN input.
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist; \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS \
+        const float4 dist = magnification_scale * \
+            abs(first_dist_unscaled - true_i); \
+        const float4 pi_dist = pi * dist; \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS; \
+        pixel_color += new_sample0 * weights.xxx; \
+        pixel_color += new_sample1 * weights.yyy; \
+        pixel_color += new_sample2 * weights.zzz; \
+        pixel_color += new_sample3 * weights.www; \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \
+        CALCULATE_R_COORD_FOR_4_SAMPLES; \
+        const float3 new_sample0 = tex2Dlod0try(tex, \
+            float2(tex_uv.x, tex_uv_r.x)).rgb; \
+        const float3 new_sample1 = tex2Dlod0try(tex, \
+            float2(tex_uv.x, tex_uv_r.y)).rgb; \
+        const float3 new_sample2 = tex2Dlod0try(tex, \
+            float2(tex_uv.x, tex_uv_r.z)).rgb; \
+        const float3 new_sample3 = tex2Dlod0try(tex, \
+            float2(tex_uv.x, tex_uv_r.w)).rgb; \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \
+        CALCULATE_R_COORD_FOR_4_SAMPLES; \
+        const float3 new_sample0 = tex2Dlod0try(tex, \
+            float2(tex_uv_r.x, tex_uv.y)).rgb; \
+        const float3 new_sample1 = tex2Dlod0try(tex, \
+            float2(tex_uv_r.y, tex_uv.y)).rgb; \
+        const float3 new_sample2 = tex2Dlod0try(tex, \
+            float2(tex_uv_r.z, tex_uv.y)).rgb; \
+        const float3 new_sample3 = tex2Dlod0try(tex, \
+            float2(tex_uv_r.w, tex_uv.y)).rgb; \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+//////////////////////////// RESAMPLING FUNCTIONS ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    // Requires: 1.) dr == du == 1.0/texture_size.x or
+    //               dr == dv == 1.0/texture_size.y
+    //               (whichever direction we're resampling in).
+    //               It's a scalar to save register space.
+    //           2.) tile_size_uv_r is the size of an input tile in tex_uv
+    //               units (the fraction of the input texture it spans), in
+    //               the direction we're resampling this pass. Its reciprocal
+    //               is used as the number of tiles per texture.
+    //           3.) magnification_scale must be <= 1.0.
+    // Returns:  Return a [Lanczos] sinc-resampled pixel of a vertically
+    //           downsized input tile embedded in an input texture. (The
+    //           vertical version is special-cased though: It assumes the
+    //           tile size equals the [static] texture size, since it's used
+    //           on an LUT texture input containing one tile. For more
+    //           generic use, eliminate the "static" in the parameters.)
+    // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    // we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        // A static loop can be faster, but it might blur too much from using
+        // more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    // Get the first sample location (scalar tile uv coord along the resized
+    // dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    // true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    // Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    // Sum up each weight and weighted sample color, varying the looping
+    // strategy based on our expected dynamic loop capabilities. See the
+    // loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        // Fixed-size pieces of 64/32/16/8/4 samples, each guarded by a branch
+        // on the remaining sample count; i_base carries the running offset
+        // that the loop body macros add to i.
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        // Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    // Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    // Differences from downsample_vertical_sinc_tiled:
+    // 1.) The dr and tile_size_uv_r parameters are not static consts.
+    // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //     set to false instead of true.
+    // 3.) The horizontal version of the loop body is used.
+    // TODO: If we can get guaranteed compile-time dead code elimination,
+    // we can combine the vertical/horizontal downsampling functions by:
+    // 1.) Add an extra static const bool parameter called "vertical."
+    // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    // 3.) Use a conditional assignment in the loop body macro. This is the
+    //     tricky part: We DO NOT want to incur the extra conditional
+    //     assignment in the inner loop at runtime!
+    // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    // we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        // If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    // Get the first sample location (scalar tile uv coord along resized
+    // dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    // false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    // Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    // Sum up each weight and weighted sample color, varying the looping
+    // strategy based on our expected dynamic loop capabilities. See the
+    // loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        // Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    // Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+//////////////////////////// TILE SIZE CALCULATION ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    // Requires: The following global constants must be defined according to
+    //           certain constraints:
+    //           1.) mask_resize_num_triads: Must be high enough that our
+    //               mask sampling method won't have artifacts later
+    //               (long story; see derived-settings-and-constants.h)
+    //           2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //           3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //           4.) mask_min_allowed_triad_size: User setting (the more
+    //               restrictive it is, the faster the resize will go)
+    //           5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //           6.) mask_triad_size_desired_{runtime, static}
+    //           7.) mask_num_triads_desired_{runtime, static}
+    //           8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //           The function parameters must be defined as follows:
+    //           1.) estimated_viewport_size == (final viewport size);
+    //               If mask_specify_num_triads is 1.0/true and the viewport
+    //               estimate is wrong, the number of triads will differ from
+    //               the user's preference by about the same factor.
+    //           2.) estimated_mask_resize_output_size: Must equal the
+    //               output size of the MASK_RESIZE pass.
+    //               Exception: The x component may be estimated garbage if
+    //               and only if the caller throws away the x result.
+    //           3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //               unless you can guarantee that every call across every
+    //               pass will use the same sizes for the other parameters.
+    //           When calling this across multiple passes, always use the
+    //           same y viewport size/scale, and always use the same x
+    //           viewport size/scale when using the x result.
+    // Returns:  Return the final size of a manually resized mask tile, after
+    //           constraining the desired size to avoid artifacts. Under
+    //           unusual circumstances, tiles may become stretched vertically
+    //           (see wall of text below).
+    // Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    // wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        // We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    // Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    // Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    // Try to maintain tile_aspect_ratio. This is the tricky part:
+    // If we're currently resizing in the y dimension, the x components
+    // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is
+    // bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    // We can't adjust the y size based on clamped_tile_size.x. If it
+    // clamps when it shouldn't, it won't clamp again when later passes
+    // call this function with the correct sizes, and the discrepancy will
+    // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit
+    // the x size based on the y size, but not vice versa, unless the
+    // caller swears the parameters were the same (correct) in every pass.
+    // As a result, triads could appear vertically stretched if:
+    // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //     LUT's might clamp x more than y (all provided LUT's are square)
+    // b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //     with a vertically oriented screen (not accounted for anyway)
+    // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //     Viewport scales are equal by default.
+    // If any of these are the case, you can fix the stretching by setting:
+    //     mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //         (1.0 / min_expected_aspect_ratio) *
+    //         (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    // We need integer tile sizes in both directions for tiled sampling to
+    // work correctly. Use floor (to make sure we don't round up), but be
+    // careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+///////////////////////// FINAL MASK SAMPLING HELPERS ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    // Requires: 1.) Requirements of get_resized_mask_tile_size() must be
+    //               met, particularly regarding global constants.
+    //           The function parameters must be defined as follows:
+    //           1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //               if get_mask_sample_mode() is 0 (otherwise anything)
+    //           2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //               if get_mask_sample_mode() is 0 (otherwise anything)
+    //           3.) true_viewport_size == output_size for a pass set to
+    //               1.0 viewport scale (i.e. it must be correct)
+    // Returns:  Return a float4 containing:
+    //               xy: tex_uv coords for the start of the mask tile
+    //               zw: tex_uv size of the mask tile from start to end
+    //           mask_tiles_per_screen is an out parameter containing the
+    //           number of mask tiles that will fit on the screen.
+    // First get the final resized tile size. The viewport size and mask
+    // resize viewport scale must be correct, but don't solemnly swear they
+    // were correct in both mask resize passes unless you know it's true.
+    // (We can better ensure a correct tile aspect ratio if the parameters are
+    // guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    // sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        // Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        // size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        // mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        // If we're tiling at the original size (1:1 pixel:texel), redefine a
+        // "tile" to be the full texture containing many triads. Otherwise,
+        // we're hardware-resampling an LUT, and the texture truly contains a
+        // single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            // Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    // Requires: 1.) duv_dx == ddx(tile_uv)
+    //           2.) duv_dy == ddy(tile_uv)
+    //           3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //               such that (0.5, 0.5) is the center of a tile, etc.
+    //               ("Tile" can mean texture, the video embedded in the
+    //               texture, or some other "tile" embedded in a texture.)
+    // Returns:  Return new tile_uv coords that contain no discontinuities
+    //           across a 2x2 pixel quad.
+    // Description:
+    // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    // derivatives, which we assume happened if the absolute difference between
+    // any fragment in a 2x2 block is > ~half a tile. If the current block has
+    // a u or v discontinuity and the current fragment is in the first half of
+    // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    // to that coord to make the 2x2 block continuous. (It will now have a
+    // coord > 1.0 in the padding area beyond the tile.) This function takes
+    // derivatives as parameters so the caller can reuse them.
+    // In case we're using high-quality (nVidia-style) derivatives, ensure
+    // diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //               tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //               tile center. The input coords can range from [0, inf],
+    //               and their fractional parts map to a repeated tile.
+    //               ("Tile" can mean texture, the video embedded in the
+    //               texture, or some other "tile" embedded in a texture.)
+    //           2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //               for the start of the embedded tile in the full texture.
+    //           3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //               tex_uv size of the embedded tile in the full texture.
+    // Returns:  Return tex_uv coords (used for texture sampling)
+    //           corresponding to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        // Manually repeat the resized mask tile to fill the screen:
+        // First get fractional tile_uv coords. Using frac/fmod on coords
+        // confuses anisotropic filtering; fix it as user options dictate.
+        // derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        // NOTE(review): fix_tiling_discontinuities_normalized() is commented
+        // out above, so the branch below only compiles while
+        // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES stays undefined --
+        // confirm that the settings headers never define it for this build.
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        // The tile is embedded in a padded FBO, and it may start at a
+        // nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        // Sample from the input phosphor mask texture with hardware tiling.
+        // If we're tiling at the original size (mode 2), the "tile" is the
+        // whole texture, and it contains a large number of triads mapped with
+        // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single
+        // unresized tile. tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif // PHOSPHOR_MASK_RESIZING_H
+
+///////////////////////// END PHOSPHOR-MASK-RESIZING /////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+    // Resize the input phosphor mask tile to the final vertical size it will
+    // appear on screen. Keep 1x horizontal size if possible (IN.output_size
+    // >= mask_resize_src_lut_size), and otherwise linearly sample horizontally
+    // to fit exactly one tile. Lanczos-resizing the phosphor mask achieves
+    // much sharper results than mipmapping, and vertically resizing first
+    // minimizes the total number of taps required. We output a number of
+    // resized tiles >= mask_resize_num_tiles for easier tiled sampling later.
+    // Fragments are discarded when the mask is not manually resized, when the
+    // sample mode is not the manual-resize mode (< 0.5), or when the fragment
+    // lies beyond mask_resize_num_tiles tiles vertically.
+    //const float2 src_tex_uv_wrap = src_tex_uv_wrap;
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        // Discard unneeded fragments in case our profile allows real branches.
+        const float2 tile_uv_wrap = src_tex_uv_wrap;
+        if(get_mask_sample_mode() < 0.5 &&
+            tile_uv_wrap.y <= mask_resize_num_tiles)
+        {
+            static const float src_dy = 1.0/mask_resize_src_lut_size.y;
+            const float2 src_tex_uv = frac(src_tex_uv_wrap);
+            float3 pixel_color;
+            // If mask_type is static, this branch will be resolved statically.
+            // The tile_size_uv_r argument is 1.0 in every call: the LUT
+            // texture is treated as exactly one tile.
+            #ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+                if(mask_type < 0.5)
+                {
+                    pixel_color = downsample_vertical_sinc_tiled(
+                        mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size,
+                        src_dy, resize_magnification_scale.y, 1.0);
+                }
+                else if(mask_type < 1.5)
+                {
+                    pixel_color = downsample_vertical_sinc_tiled(
+                        mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size,
+                        src_dy, resize_magnification_scale.y, 1.0);
+                }
+                else
+                {
+                    pixel_color = downsample_vertical_sinc_tiled(
+                        mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size,
+                        src_dy, resize_magnification_scale.y, 1.0);
+                }
+            #else
+                if(mask_type < 0.5)
+                {
+                    pixel_color = downsample_vertical_sinc_tiled(
+                        mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size,
+                        src_dy, resize_magnification_scale.y, 1.0);
+                }
+                else if(mask_type < 1.5)
+                {
+                    pixel_color = downsample_vertical_sinc_tiled(
+                        mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size,
+                        src_dy, resize_magnification_scale.y, 1.0);
+                }
+                else
+                {
+                    pixel_color = downsample_vertical_sinc_tiled(
+                        mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size,
+                        src_dy, resize_magnification_scale.y, 1.0);
+                }
+            #endif
+            // The input LUT was linear RGB, and so is our output:
+            FragColor = float4(pixel_color, 1.0);
+        }
+        else
+        {
+            discard;
+        }
+    #else
+        // NOTE(review): the FragColor write follows an unconditional discard,
+        // so it has no effect on the output.
+        discard;
+        FragColor = float4(1.0, 1.0, 1.0, 1.0);
+    #endif
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/mask-resize-vertical.vs b/shaders/CRT-Royale.shader/mask-resize-vertical.vs
new file mode 100644
index 00000000..2dac429b
--- /dev/null
+++ b/shaders/CRT-Royale.shader/mask-resize-vertical.vs
@@ -0,0 +1,3212 @@
+#version 150
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {
+   vec2 vTexCoord;
+   vec2 src_tex_uv_wrap;
+   vec2 resize_magnification_scale;
+};
+
+uniform vec4 targetSize;
+uniform vec4 sourceSize[];
+uniform int phase;
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note here called 0.5 unnecessarily dark and mentioned using 1.0, but 0.5 is the value actually set) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note here called 0.5 unnecessarily dark and mentioned using 1.0, but 0.5 is the value actually set) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + // NOTE(review): aa_subpixel_r_offset_static.x is negative with the current + // static setting (-1.0/3.0), which shrinks the border computed below + // instead of widening it; confirm whether abs() was intended here. + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here.
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset()
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() +{ + // Returns: The reciprocal of the average color of the mask selected by + // mask_type (< 0.5: grille, < 1.5: slot, otherwise shadow). + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + // Returns: The runtime sample mode if + // RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT is #defined, else the static + // one. Without PHOSPHOR_MASK_MANUALLY_RESIZE, the result is clamped + // to [1, 2]. + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version.
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" + +///////////////////////////// CODEPATH SELECTION ///////////////////////////// + +// Choose a looping strategy based on what's allowed: +// Dynamic loops not allowed: Use a flat static loop. +// Dynamic loops accomodated: Coarsely branch around static loops. +// Dynamic loops assumed allowed: Use a flat dynamic loop. +#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. +// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. 
Round up to be safe: +static const float max_sinc_resize_samples_m4 = ceil( + max_sinc_resize_samples_float * 0.25) * 4.0; + + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) +{ + // Requires: The following global constants must be defined: + // 1.) mask_sinc_lobes + // 2.) max_sinc_resize_samples_m4 + // Returns: The minimum number of texture samples for a correct downsize + // at magnification_scale. + // We're downsizing, so the filter is sized across 2*lobes output pixels + // (not 2*lobes input texels). This impacts distance measurements and the + // minimum number of input samples needed. + const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale; + const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0; + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + const float max_samples_m4 = max_sinc_resize_samples_m4; + #else // ifdef BREAK_LOOPS_INTO_PIECES + // Simulating loops with branches imposes a 128-sample limit. + const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4); + #endif + return min(min_samples_m4, max_samples_m4); +} + +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, + const float input_tiles_per_texture_r, const float samples, + static const bool vertical) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) input_tiles_per_texture_r is the number of input tiles + // that can fit in the input texture in the direction we're + // resampling this pass. + // 3.) vertical indicates whether we're resampling vertically + // this pass (or horizontally). + // Returns: Pack and return the first sample's tile_uv coord in [0, 1] + // and its texel distance from the destination pixel, in the + // resized dimension only. 
+ // We'll start with the topmost or leftmost sample and work down or right, + // so get the first sample location and distance. Modify both dimensions + // as if we're doing a one-pass 2D resize; we'll throw away the unneeded + // (and incorrect) dimension at the end. + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. + const float2 first_texel_tile_uv_wrap_2D = + first_texel_uv_wrap_2D * input_tiles_per_texture_r; + // Project wrapped coordinates to the [0, 1] range. We'll do this with all + // samples,but the first texel is special, since it might be negative. + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; + // Pack the first texel's tile_uv coord and texel distance in 1D: + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + return vertical ? tile_v_and_dist : tile_u_and_dist; + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); +} + +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) +{ + // Mipmapping and anisotropic filtering get confused by sinc-resampling. 
+ // One [slow] workaround is to select the lowest mip level (explicit LOD 0): + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + return textureLod(tex, tex_uv, 0.0); + #else + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); // NOTE(review): Cg intrinsic; no GLSL mapping is visible in this chunk - confirm before enabling + #else + return texture(tex, tex_uv); + #endif + #endif +} + + +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples.
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = max(float4(1.0e-6), magnification_scale * \ + abs(first_dist_unscaled - true_i)); /* keep dist > 0: sinc must not evaluate 0/0; GLSL min() need not squash NaN */ \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ +
UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them.
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly.
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonically opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture. 
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + gl_Position = position; + vTexCoord = texCoord; + float2 tex_uv = vTexCoord.xy; + // First estimate the viewport size (the user will get the wrong number of + // triads if it's wrong and mask_specify_num_triads is 1.0/true). 
+ const float viewport_y = output_size.y / mask_resize_viewport_scale.y; + const float aspect_ratio = geom_aspect_ratio_x / geom_aspect_ratio_y; + const float2 estimated_viewport_size = + float2(viewport_y * aspect_ratio, viewport_y); + // Estimate the output size of MASK_RESIZE (the next pass). The estimated + // x component shouldn't matter, because we're not using the x result, and + // we're not swearing it's correct (if we did, the x result would influence + // the y result to maintain the tile aspect ratio). + const float2 estimated_mask_resize_output_size = + float2(output_size.y * aspect_ratio, output_size.y); + // Find the final intended [y] size of our resized phosphor mask tiles, + // then the tile size for the current pass (resize y only): + float2 mask_resize_tile_size = get_resized_mask_tile_size( + estimated_viewport_size, estimated_mask_resize_output_size, false); + float2 pass_output_tile_size = float2(min( + mask_resize_src_lut_size.x, output_size.x), mask_resize_tile_size.y); + + // We'll render resized tiles until filling the output FBO or meeting a + // limit, so compute [wrapped] tile uv coords based on the output uv coords + // and the number of tiles that will fit in the FBO. + const float2 output_tiles_this_pass = output_size / pass_output_tile_size; + const float2 output_video_uv = tex_uv * texture_size / video_size; + const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass; + + // The input LUT is just a single mask tile, so texture uv coords are the + // same as tile uv coords (save frac() for the fragment shader). 
The + // magnification scale is also straightforward: + src_tex_uv_wrap = tile_uv_wrap; + resize_magnification_scale = + pass_output_tile_size / mask_resize_src_lut_size; +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs b/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs new file mode 100644 index 00000000..a987afbb --- /dev/null +++ b/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs @@ -0,0 +1,10845 @@ +#version 150 + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; +uniform int phase; +uniform sampler2D pixmap[]; + +in Vertex { + vec2 vTexCoord; + vec2 video_uv; + vec2 scanline_tex_uv; + vec2 blur3x3_tex_uv; + vec2 halation_tex_uv; + vec2 scanline_texture_size_inv; + vec4 mask_tile_start_uv_and_size; + vec2 mask_tiles_per_screen; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define 
geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +#define VERTICAL_SCANLINEStexture source[5] +#define VERTICAL_SCANLINEStexture_size sourceSize[5].xy +#define VERTICAL_SCANLINESvideo_size sourceSize[5].xy +#define BLOOM_APPROXtexture source[4] +#define BLOOM_APPROXtexture_size sourceSize[4].xy +#define BLOOM_APPROXvideo_size sourceSize[4].xy +#define HALATION_BLURtexture source[2] +#define HALATION_BLURtexture_size sourceSize[2].xy +#define HALATION_BLURvideo_size sourceSize[2].xy +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #define MASK_RESIZEtexture source[0] +#else + #define MASK_RESIZEtexture source[0] +#endif +#define MASK_RESIZEtexture_size sourceSize[0] +#define MASK_RESIZEvideo_size sourceSize[0] + +#define input_texture source[0] +#define mask_grille_texture_large pixmap[1] +#define mask_slot_texture_large pixmap[3] +#define mask_shadow_texture_large pixmap[5] + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define 
atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +// VERTEX INCLUDES // + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. 
+ #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now).
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (the value used here); raise it toward 1.0 if the result looks unnecessarily dark + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel?
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now).
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines.
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (the value used here); raise it toward 1.0 if the result looks unnecessarily dark + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (try 1.0 if that looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (try 1.0 if that looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) // Returns the unit-length vector normalize((clamped aspect ratio, 1.0)). +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() // Packs geom_overscan_x and geom_overscan_y into one float2. +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() // Packs geom_tilt_angle_x and geom_tilt_angle_y into one float2. +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() // x offsets ordered (r, g, b). +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() // y offsets ordered (r, g, b). +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() // (x, y) offset for the red channel. +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() // (x, y) offset for the green channel. +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() // (x, y) offset for the blue channel. +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() // Runtime offsets are returned only if RUNTIME_ANTIALIAS_WEIGHTS and RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS are both #defined; otherwise the static offset.
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() // Returns 1/average mask color: grille if mask_type < 0.5, slot if < 1.5, else shadow. +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() // Uses the runtime (desired) or static mode; without PHOSPHOR_MASK_MANUALLY_RESIZE the result is clamped to [1, 2]. +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. 
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (some users find that unnecessarily dark and prefer 1.0) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (some users find that unnecessarily dark and prefer 1.0) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + // Use the offset's magnitude: The red subpixel offset may be negative (the + // blue offset is its negation), and a signed value would shrink the + // worst-case border instead of growing it. + static const float max_subpixel_offset = max( + abs(aa_subpixel_r_offset_static.x), abs(aa_subpixel_r_offset_static.y)); + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / (s * last_factorial); + // for(int i = 1; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + i) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: NOTE(review): the two texel_off overloads below pass texel_off as the third argument of textureLod() (presumably the LOD) instead of applying a texel offset; confirm this is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take
+    //                  shape_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              the given color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    return beam_min_shape + shape_range * pow(color, float3(beam_shape_power));
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    The scanline's light output averaged over one output pixel.
+    //  Details:
+    //  The CRT beam profile is roughly Gaussian and wider for bright colors
+    //  than dark ones.  A Gaussian always integrates to 1.0 over its full
+    //  range, so the standard deviation can vary with the color without
+    //  changing overall brightness.  With 'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  The definite integral across the pixel is the difference of two
+    //  erf() evaluations, half a pixel above and below dist, divided by the
+    //  pixel height to get an average.  Even if curved coords were used in
+    //  this pass, a flat scalar pixel height works almost as well as a pixel
+    //  height computed from a full pixel-space to scanline-space matrix.
+    const float3 half_height = float3(pixel_height * 0.5);
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  1/(sigma*sqrt(2)), shared by both erf() arguments:
+    const float3 inv_sigma_root2 = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_far = erf((dist + half_height)*inv_sigma_root2);
+    const float3 erf_near = erf((dist - half_height)*inv_sigma_root2);
+    const float3 erf_span = erf_far - erf_near;
+    return color * 0.5*erf_span/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
+    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;
+    const float3 dist0 = dist - ph_offset;
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
+        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
+        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  Point-sampled counterpart of scanline_gaussian_integral_contrib();
+    //  see that function for the parameter requirements and background.
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //  Returns:    color scaled by one Gaussian sample at dist, or by the
+    //              average of three samples (dist and dist +/- pixel_height/3)
+    //              when beam_antialias_level > 0.5.
+    //  Work with 1/sigma throughout to avoid repeated divides:
+    const float3 inv_sigma = float3(1.0)/get_gaussian_sigma(color, sigma_range);
+    //  1/(2*sigma**2), the factor inside the exponent:
+    const float3 exp_scale = 0.5 * inv_sigma * inv_sigma;
+    //  1/(sigma*sqrt(2*pi)), the normalization in front of the exponential:
+    const float3 norm_scale = inv_sigma/sqrt(2.0*pi);
+    //  Both paths need the sample taken exactly at dist:
+    const float3 center_weight = exp(-(dist*dist)*exp_scale);
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  No antialiasing requested: a single sample is enough.
+        return color*center_weight*norm_scale;
+    }
+    //  Sample 1/3 pixel away in each direction as well:
+    const float3 third_pixel = float3(pixel_height/3.0);
+    const float3 dist_far = dist + third_pixel;
+    const float3 dist_near = abs(dist - third_pixel);
+    const float3 far_weight = exp(-(dist_far*dist_far)*exp_scale);
+    const float3 near_weight = exp(-(dist_near*dist_near)*exp_scale);
+    //  Average the three pure Gaussian samples:
+    const float3 brightness = color/3.0 * norm_scale;
+    return brightness * (center_weight + far_weight + near_weight);
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Avoid repeated divides:
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 beta_inv = float3(1.0)/beta;
+    const float3 scale = color * beta * 0.5 * alpha_inv /
+        gamma_impl(beta_inv, beta);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+    if(beam_generalized_gaussian)
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+    }
+    else
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+        else
+        {
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    //  Use max to avoid bizarre artifacts from negative colors:
+    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    //  Note:       Every pow() below takes a float3 exponent.  GLSL's pow()
+    //              requires both operands to have the same type, so the scalar
+    //              gamma is always widened with float3() first.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  beam_horiz_linear_rgb_weight is static, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs are gamma-encoded: blend first, then decode the result.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                //  Some linear-light blending is wanted: decode each input
+                //  before blending, then mix the two results.
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            //  Inputs are already linear: a plain blend is the linear result.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                //  Some gamma-space blending is wanted: encode each input
+                //  before blending, then mix the two results.
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        //  No branching allowed: always compute both blends and lerp.
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            //  The exponent is wrapped in float3() like every other pow() in
+            //  this function; a bare float exponent does not match any GLSL
+            //  pow() overload.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(1.0/intermediate_gamma)),
+                pow(color1, float3(1.0/intermediate_gamma)),
+                pow(color2, float3(1.0/intermediate_gamma)),
+                pow(color3, float3(1.0/intermediate_gamma)),
+                weights);
+            //  FIXME (inherited): a disabled override of the blend weight was
+            //  left here; its purpose is not documented.  Kept for reference:
+//            const float beam_horiz_linear_rgb_weight1 = 1.0;
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  We can ignore the outside texture lookups for Quilez resampling.
+    const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    float3 color0 = float3(0.0);
+    float3 color3 = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Sample the texture as-is, whether it's linear or gamma-encoded:
+    //  get_interpolated_linear_color() will handle the difference.
+    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  As used here, tex_uv * tex_size is a texel-space position, and
+    //  multiplying by texture_size_inv converts texel positions back to uv.
+    //  Returns:    The horizontally filtered color from get_scanline_color()
+    //              for the row containing tex_uv, with weights chosen by
+    //              beam_horiz_filter and normalized to sum to 1.0.
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Snap x to the center of the texel at or just left of the sample.  Use
+    //  under_half to fix a rounding bug right around exact texel locations.
+    const float left_center_x = floor(texel_pos.x - under_half) + 0.5;
+    //  Only x is snapped; the row (y) is taken from the sample as-is:
+    const float2 left_texel_uv =
+        float2(left_center_x, texel_pos.y) * texture_size_inv;
+    //  Horizontal distances from the sample to the 4 nearby texel centers
+    //  (one further left, left, right, one further right):
+    const float frac_x = texel_pos.x - left_center_x;
+    const float4 tap_dists = float4(1.0 + frac_x, frac_x,
+        1.0 - frac_x, 2.0 - frac_x);
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez: only the two inner texels get nonzero weights.
+        const float t = tap_dists.y;
+        const float right_weight = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - right_weight, right_weight, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian:
+        float inv_two_variance = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(tap_dists*tap_dists)*inv_two_variance);
+    }
+    else
+    {
+        //  Lanczos2:
+        const float4 scaled_dists = FIX_ZERO(tap_dists * pi);
+        raw_weights = 2.0 * sin(scaled_dists) * sin(scaled_dists * 0.5) /
+            (scaled_dists * scaled_dists);
+    }
+    //  Ensure the weight sum == 1.0, then get the interpolated horizontal
+    //  scanline color, stepping one texel at a time in x:
+    return get_scanline_color(tex, left_texel_uv,
+        float2(texture_size_inv.x, 0.0),
+        raw_weights/dot(raw_weights, float4(1.0)));
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Returns:    The filtered scanline color at tex_uv.  When
+    //              beam_misconvergence is set, each channel is sampled at its
+    //              own horizontally shifted position.
+    //  Rely on a helper to make convergence easier.
+    if(beam_misconvergence)
+    {
+        //  Per-channel u shifts, scaled by texture_size_inv.x:
+        const float3 shift_u_rgb =
+            get_convergence_offsets_x_vector() * texture_size_inv.xxx;
+        //  Take one full sample per channel and keep only that channel:
+        const float red = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u_rgb.r, 0.0), tex_size, texture_size_inv).r;
+        const float green = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u_rgb.g, 0.0), tex_size, texture_size_inv).g;
+        const float blue = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u_rgb.b, 0.0), tex_size, texture_size_inv).b;
+        return float3(red, green, blue);
+    }
+    else
+    {
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.
Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    // wtf fixme
+//    const float interlace_bff1 = 1.0;
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess whether the source is interlaced from its number of lines.
+    //  Returns:    false whenever interlace_detect is off.  Otherwise true for
+    //              288.5 < num_lines < 576.5, and for 1079.5 < num_lines <
+    //              1080.5 if interlace_1080i is set.
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do we place our cutoffs?  Assumptions:
+        //  1.) We only need to care about active lines.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+        //  5.) Just in case the main program uses calculated video sizes,
+        //      we should nudge the float thresholds a bit.
+        const bool is_sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+        const bool is_1080_lines = (num_lines > 1079.5) && (num_lines < 1080.5);
+        //  1080-line sources only count when the user opted in:
+        return is_sd_range || (bool(interlace_1080i) && is_1080_lines);
+    }
+    return false;
+}
+
+#endif // SCANLINE_FUNCTIONS_H
+
+///////////////////////////// END SCANLINE-FUNCTIONS ////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
+
+// crt-royale: A full-featured CRT shader, with cheese.
+// Copyright (C) 2014 TroggleMonkey
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+////////////////////////////////// INCLUDES //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+///////////////////////////// CODEPATH SELECTION /////////////////////////////
+
+// Choose a looping strategy based on what's allowed:
+// Dynamic loops not allowed: Use a flat static loop.
+// Dynamic loops accommodated: Coarsely branch around static loops.
+// Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. +// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. Round up to be safe: +static const float max_sinc_resize_samples_m4 = ceil( + max_sinc_resize_samples_float * 0.25) * 4.0; + + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) +{ + // Requires: The following global constants must be defined: + // 1.) mask_sinc_lobes + // 2.) max_sinc_resize_samples_m4 + // Returns: The minimum number of texture samples for a correct downsize + // at magnification_scale. + // We're downsizing, so the filter is sized across 2*lobes output pixels + // (not 2*lobes input texels). This impacts distance measurements and the + // minimum number of input samples needed. 
+ const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale; + const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0; + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + const float max_samples_m4 = max_sinc_resize_samples_m4; + #else // ifdef BREAK_LOOPS_INTO_PIECES + // Simulating loops with branches imposes a 128-sample limit. + const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4); + #endif + return min(min_samples_m4, max_samples_m4); +} + +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, + const float input_tiles_per_texture_r, const float samples, + static const bool vertical) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) input_tiles_per_texture_r is the number of input tiles + // that can fit in the input texture in the direction we're + // resampling this pass. + // 3.) vertical indicates whether we're resampling vertically + // this pass (or horizontally). + // Returns: Pack and return the first sample's tile_uv coord in [0, 1] + // and its texel distance from the destination pixel, in the + // resized dimension only. + // We'll start with the topmost or leftmost sample and work down or right, + // so get the first sample location and distance. Modify both dimensions + // as if we're doing a one-pass 2D resize; we'll throw away the unneeded + // (and incorrect) dimension at the end. + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. 
+ const float2 first_texel_tile_uv_wrap_2D = + first_texel_uv_wrap_2D * input_tiles_per_texture_r; + // Project wrapped coordinates to the [0, 1] range. We'll do this with all + // samples, but the first texel is special, since it might be negative. + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; + // Pack the first texel's tile_uv coord and texel distance in 1D: + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + return vertical ? tile_v_and_dist : tile_u_and_dist; + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); +} + +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) +{ + // Mipmapping and anisotropic filtering get confused by sinc-resampling. + // One [slow] workaround is to force mip level 0 (textureLod's third arg): + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + return textureLod(tex, tex_uv, 0.0); + #else + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); + #else + return texture(tex, tex_uv); + #endif + #endif +} + + +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.)
Accumulate the weight sum in weights, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples. + #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them.
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so are max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly.
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonally opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture.
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +//#include "../../../../include/gamma-management.h" +// already got it + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 tex2Dtiled_mask_linearize(const sampler2D tex, + const float2 tex_uv) +{ + // If we're manually tiling a texture, anisotropic filtering can get + // confused. 
One workaround is to just select the lowest mip level: + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + // TODO: Use tex2Dlod_linearize with a calculated mip level. + return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0)); + #else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0)); + #else + return tex2D_linearize(tex, tex_uv); + #endif + #endif + #else + return tex2D_linearize(tex, tex_uv); + #endif +} + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +// END VERTEX INCLUDES // + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +//#include "bloom-functions.h" + +//////////////////////////// BEGIN BLOOM-FUNCTIONS /////////////////////////// + +#ifndef BLOOM_FUNCTIONS_H +#define BLOOM_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These utility functions and constants help several passes determine the +// size and center texel weight of the phosphor bloom in a uniform manner. 
+ + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// We need to calculate the correct blur sigma using some .cgp constants: +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. 
Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now).
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (a leftover note here claimed it was raised to 1.0, but the value in effect is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel?
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now).
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines.
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (a leftover note here claimed it was raised to 1.0, but the value in effect is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP.
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/blur-functions.h" + +//////////////////////////// BEGIN BLUR-FUNCTIONS /////////////////////////// + +#ifndef BLUR_FUNCTIONS_H +#define BLUR_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides reusable one-pass and separable (two-pass) blurs. +// Requires: All blurs share these requirements (dxdy requirement is split): +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) 
filter_linearN must == "true" in your .cgp preset unless +// you're using tex2DblurNresize at 1x scale. +// 3.) mipmap_inputN must == "true" in your .cgp preset if +// output_size < video_size. +// 4.) output_size == video_size / pow(2, M), where M is some +// positive integer. tex2Dblur*resize can resize arbitrarily +// (and the blur will be done after resizing), but arbitrary +// resizes "fail" with other blurs due to the way they mix +// static weights with bilinear sample exploitation. +// 5.) In general, dxdy should contain the uv pixel spacing: +// dxdy = (video_size/output_size)/texture_size +// 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), +// zero out the dxdy component in the unblurred dimension: +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) +// Many blurs share these requirements: +// 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, +// or they will blur more in the lower-scaled dimension. +// 2.) One-pass shared sample blurs require ddx(), ddy(), and +// tex2Dlod() to be supported by the current Cg profile, and +// the drivers must support high-quality derivatives. +// 3.) One-pass shared sample blurs require: +// tex_uv.w == log2(video_size/output_size).y; +// Non-wrapper blurs share this requirement: +// 1.) sigma is the intended standard deviation of the blur +// Wrapper blurs share this requirement, which is automatically +// met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below): +// 1.) blurN_std_dev must be global static const float values +// specifying standard deviations for Nx blurs in units +// of destination pixels +// Optional: 1.) The including file (or an earlier included file) may +// optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace +// default standard deviations with those matching a binomial +// distribution. (See below for details/properties.) +// 2.) 
The including file (or an earlier included file) may +// optionally #define OVERRIDE_BLUR_STD_DEVS and override: +// static const float blur3_std_dev +// static const float blur4_std_dev +// static const float blur5_std_dev +// static const float blur6_std_dev +// static const float blur7_std_dev +// static const float blur8_std_dev +// static const float blur9_std_dev +// static const float blur10_std_dev +// static const float blur11_std_dev +// static const float blur12_std_dev +// static const float blur17_std_dev +// static const float blur25_std_dev +// static const float blur31_std_dev +// static const float blur43_std_dev +// 3.) The including file (or an earlier included file) may +// optionally #define OVERRIDE_ERROR_BLURRING and override: +// static const float error_blurring +// This tuning value helps mitigate weighting errors from one- +// pass shared-sample blurs sharing bilinear samples between +// fragments. Values closer to 0.0 have "correct" blurriness +// but allow more artifacts, and values closer to 1.0 blur away +// artifacts by sampling closer to halfway between texels. +// UPDATE 6/21/14: The above static constants may now be overridden +// by non-static uniform constants. This permits exposing blur +// standard deviations as runtime GUI shader parameters. However, +// using them keeps weights from being statically computed, and the +// speed hit depends on the blur: On my machine, uniforms kill over +// 53% of the framerate with tex2Dblur12x12shared, but they only +// drop the framerate by about 18% with tex2Dblur11fast. +// Quality and Performance Comparisons: +// For the purposes of the following discussion, "no sRGB" means +// GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't. +// 1.) tex2DblurNfast is always faster than tex2DblurNresize. +// 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize +// well, because they're the only ones that don't exploit bilinear samples. 
+// This also means they're the only functions which can be truly gamma- +// correct without linear (or sRGB FBO) input, but only at 1x scale. +// 3.) One-pass shared sample blurs only have a speed advantage without sRGB. +// They also have some inaccuracies due to their shared-[bilinear-]sample +// design, which grow increasingly bothersome for smaller blurs and higher- +// frequency source images (relative to their resolution). I had high +// hopes for them, but their most realistic use case is limited to quickly +// reblurring an already blurred input at full resolution. Otherwise: +// a.) If you're blurring a low-resolution source, you want a better blur. +// b.) If you're blurring a lower mipmap, you want a better blur. +// c.) If you're blurring a high-resolution, high-frequency source, you +// want a better blur. +// 4.) The one-pass blurs without shared samples grow slower for larger blurs, +// but they're competitive with separable blurs at 5x5 and smaller, and +// even tex2Dblur7x7 isn't bad if you're wanting to conserve passes. +// Here are some framerates from a GeForce 8800GTS. The first pass resizes to +// viewport size (4x in this test) and linearizes for sRGB codepaths, and the +// remaining passes perform 6 full blurs. Mipmapped tests are performed at the +// same scale, so they just measure the cost of mipmapping each FBO (only every +// other FBO is mipmapped for separable blurs, to mimic realistic usage). 
+// Mipmap Neither sRGB+Mipmap sRGB Function +// 76.0 92.3 131.3 193.7 tex2Dblur3fast +// 63.2 74.4 122.4 175.5 tex2Dblur3resize +// 93.7 121.2 159.3 263.2 tex2Dblur3x3 +// 59.7 68.7 115.4 162.1 tex2Dblur3x3resize +// 63.2 74.4 122.4 175.5 tex2Dblur5fast +// 49.3 54.8 100.0 132.7 tex2Dblur5resize +// 59.7 68.7 115.4 162.1 tex2Dblur5x5 +// 64.9 77.2 99.1 137.2 tex2Dblur6x6shared +// 55.8 63.7 110.4 151.8 tex2Dblur7fast +// 39.8 43.9 83.9 105.8 tex2Dblur7resize +// 40.0 44.2 83.2 104.9 tex2Dblur7x7 +// 56.4 65.5 71.9 87.9 tex2Dblur8x8shared +// 49.3 55.1 99.9 132.5 tex2Dblur9fast +// 33.3 36.2 72.4 88.0 tex2Dblur9resize +// 27.8 29.7 61.3 72.2 tex2Dblur9x9 +// 37.2 41.1 52.6 60.2 tex2Dblur10x10shared +// 44.4 49.5 91.3 117.8 tex2Dblur11fast +// 28.8 30.8 63.6 75.4 tex2Dblur11resize +// 33.6 36.5 40.9 45.5 tex2Dblur12x12shared +// TODO: Fill in benchmarks for new untested blurs. +// tex2Dblur17fast +// tex2Dblur25fast +// tex2Dblur31fast +// tex2Dblur43fast +// tex2Dblur3x3resize + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// Set static standard deviations, but allow users to override them with their +// own constants (even non-static uniforms if they're okay with the speed hit): +#ifndef OVERRIDE_BLUR_STD_DEVS + // blurN_std_dev values are specified in terms of dxdy strides. + #ifdef USE_BINOMIAL_BLUR_STD_DEVS + // By request, we can define standard deviations corresponding to a + // binomial distribution with p = 0.5 (related to Pascal's triangle). + // This distribution works such that blurring multiple times should + // have the same result as a single larger blur. These values are + // larger than default for blurs up to 6x and smaller thereafter. 
        //  Binomial-derived set, compiled when USE_BINOMIAL_BLUR_STD_DEVS is
        //  #defined.  Note that blur3/blur4 share one value here, as do
        //  blur31/blur43.
        static const float blur3_std_dev = 0.84931640625;
        static const float blur4_std_dev = 0.84931640625;
        static const float blur5_std_dev = 1.0595703125;
        static const float blur6_std_dev = 1.06591796875;
        static const float blur7_std_dev = 1.17041015625;
        static const float blur8_std_dev = 1.1720703125;
        static const float blur9_std_dev = 1.2259765625;
        static const float blur10_std_dev = 1.21982421875;
        static const float blur11_std_dev = 1.25361328125;
        static const float blur12_std_dev = 1.2423828125;
        static const float blur17_std_dev = 1.27783203125;
        static const float blur25_std_dev = 1.2810546875;
        static const float blur31_std_dev = 1.28125;
        static const float blur43_std_dev = 1.28125;
    #else
        //  The defaults are the largest values that keep the largest unused
        //  blur term on each side <= 1.0/256.0.  (We could get away with more
        //  or be more conservative, but this compromise is pretty reasonable.)
        //  Unlike the binomial set, these grow steadily with the blur size.
        static const float blur3_std_dev = 0.62666015625;
        static const float blur4_std_dev = 0.66171875;
        static const float blur5_std_dev = 0.9845703125;
        static const float blur6_std_dev = 1.02626953125;
        static const float blur7_std_dev = 1.36103515625;
        static const float blur8_std_dev = 1.4080078125;
        static const float blur9_std_dev = 1.7533203125;
        static const float blur10_std_dev = 1.80478515625;
        static const float blur11_std_dev = 2.15986328125;
        static const float blur12_std_dev = 2.215234375;
        static const float blur17_std_dev = 3.45535583496;
        static const float blur25_std_dev = 5.3409576416;
        static const float blur31_std_dev = 6.86488037109;
        static const float blur43_std_dev = 10.1852050781;
    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
#endif  //  OVERRIDE_BLUR_STD_DEVS

//  A default error_blurring is only declared when the including file has not
//  #defined OVERRIDE_ERROR_BLURRING:
#ifndef OVERRIDE_ERROR_BLURRING
    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
    //  in shared-sample blurs but increase blurring and feature shifting.
+ static const float error_blurring = 0.5; +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +// gamma-management.h relies on pass-specific settings to guide its behavior: +// FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. +//#include "gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) 
#define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) 
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. 
//  This works for simple use cases where input_gamma == output_gamma, but it
//  breaks down for more complex scenarios like CRT simulation, where the pass
//  number determines the gamma encoding of the input and output.


///////////////////////////////  BASE CONSTANTS  ///////////////////////////////

//  Set standard gamma constants, but allow users to override them:
//  (When OVERRIDE_STANDARD_GAMMA is #defined, none of the constants below are
//  declared here and the including file must supply all of them itself.)
#ifndef OVERRIDE_STANDARD_GAMMA
    //  Standard encoding gammas:
    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
    //  Typical device decoding gammas (only use for emulating devices):
    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
    //  gammas: The standards purposely undercorrected for an analog CRT's
    //  assumed 2.5 reference display gamma to maintain contrast in assumed
    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
    //  These unstated assumptions about display gamma and perceptual rendering
    //  intent caused a lot of confusion, and more modern CRT's seemed to target
    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
    //  displays designed to view sRGB in bright environments.  (Standards are
    //  also in flux again with BT.1886, but it's underspecified for displays.)
    //  NOTE(review): of the constants in this group, only ntsc_gamma,
    //  crt_reference_gamma_high and lcd_office_gamma are read by the default
    //  getter functions defined right after this section; the others may be
    //  used elsewhere or kept purely for user overrides -- confirm before
    //  removing any.
    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
    static const float lcd_reference_gamma = 2.5;       //  To match CRT
    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
#endif  //  OVERRIDE_STANDARD_GAMMA

//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
//  but only if they're aware of it.
#if !defined(OVERRIDE_ALPHA_ASSUMPTIONS)
    //  Default: keep whatever alpha the sampled/output color carries.
    static const bool assume_opaque_alpha = false;
#endif


///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////

//  Gamma values may be overridden with runtime (uniform) user parameters, and
//  global constants cannot be defined in terms of uniforms.  Every derived
//  gamma is therefore exposed through a tiny getter function instead of a
//  global constant.

//  Device gammas.  With OVERRIDE_DEVICE_GAMMA the including file promises to
//  define crt_gamma, gba_gamma and lcd_gamma globally; otherwise fall back to
//  the standard constants.
#ifndef OVERRIDE_DEVICE_GAMMA
    inline float get_crt_gamma()
    {
        return crt_reference_gamma_high;
    }
    inline float get_gba_gamma()
    {
        return 3.5;     //  Game Boy Advance; in (3.0, 4.0)
    }
    inline float get_lcd_gamma()
    {
        return lcd_office_gamma;
    }
#else
    inline float get_crt_gamma()
    {
        return crt_gamma;
    }
    inline float get_gba_gamma()
    {
        return gba_gamma;
    }
    inline float get_lcd_gamma()
    {
        return lcd_gamma;
    }
#endif  //  OVERRIDE_DEVICE_GAMMA

//  Decoding/encoding gammas for the first/last passes.  With
//  OVERRIDE_FINAL_GAMMA the including file promises to define
//  intermediate_gamma, input_gamma and output_gamma globally, and none of the
//  SIMULATE_* macros are consulted.
#if defined(OVERRIDE_FINAL_GAMMA)
    inline float get_intermediate_gamma()
    {
        return intermediate_gamma;
    }
    inline float get_input_gamma()
    {
        return input_gamma;
    }
    inline float get_output_gamma()
    {
        return output_gamma;
    }
#else
    //  When every pass is gamma-encoded, ntsc_gamma is always used between
    //  passes so middle passes never need to know what is being simulated:
    inline float get_intermediate_gamma()
    {
        return ntsc_gamma;
    }
    //  The first SIMULATE_* macro that is #defined wins, in this order:
    #if defined(SIMULATE_CRT_ON_LCD)
        inline float get_input_gamma()
        {
            return get_crt_gamma();
        }
        inline float get_output_gamma()
        {
            return get_lcd_gamma();
        }
    #elif defined(SIMULATE_GBA_ON_LCD)
        inline float get_input_gamma()
        {
            return get_gba_gamma();
        }
        inline float get_output_gamma()
        {
            return get_lcd_gamma();
        }
    #elif defined(SIMULATE_LCD_ON_CRT)
        inline float get_input_gamma()
        {
            return get_lcd_gamma();
        }
        inline float get_output_gamma()
        {
            return get_crt_gamma();
        }
    #elif defined(SIMULATE_GBA_ON_CRT)
        inline float get_input_gamma()
        {
            return get_gba_gamma();
        }
        inline float get_output_gamma()
        {
            return get_crt_gamma();
        }
    #else   //  Don't simulate anything:
        inline float get_input_gamma()
        {
            return ntsc_gamma;
        }
        inline float get_output_gamma()
        {
            return ntsc_gamma;
        }
    #endif  //  SIMULATE_* selection
#endif  //  OVERRIDE_FINAL_GAMMA

//  Decoding/encoding policy for the current pass.  linearize_input and
//  gamma_encode_output stay static constants (they aren't derived from
//  overridable values), which lets the compiler eliminate dead branches.
//
//  Input is decoded on the first pass, or on every pass when
//  GAMMA_ENCODE_EVERY_FBO is #defined:
#if defined(GAMMA_ENCODE_EVERY_FBO) || defined(FIRST_PASS)
    static const bool linearize_input = true;
#else
    static const bool linearize_input = false;
#endif
//  Output is encoded on the last pass, or on every pass when
//  GAMMA_ENCODE_EVERY_FBO is #defined:
#if defined(GAMMA_ENCODE_EVERY_FBO) || defined(LAST_PASS)
    static const bool gamma_encode_output = true;
#else
    static const bool gamma_encode_output = false;
#endif

//  Gamma this pass decodes its input with (1.0 when nothing is decoded):
#if defined(FIRST_PASS)
    inline float get_pass_input_gamma()
    {
        return get_input_gamma();
    }
#elif defined(GAMMA_ENCODE_EVERY_FBO)
    inline float get_pass_input_gamma()
    {
        return get_intermediate_gamma();
    }
#else
    inline float get_pass_input_gamma()
    {
        return 1.0;
    }
#endif

//  Gamma this pass encodes its output with (1.0 when nothing is encoded):
#if defined(LAST_PASS)
    inline float get_pass_output_gamma()
    {
        return get_output_gamma();
    }
#elif defined(GAMMA_ENCODE_EVERY_FBO)
    inline float get_pass_output_gamma()
    {
        return get_intermediate_gamma();
    }
#else
    inline float get_pass_output_gamma()
    {
        return 1.0;
    }
#endif

//  Bilinear filtering is only gamma-correct when the pass input does not need
//  to be linearized after sampling:
static const bool gamma_aware_bilinear = !linearize_input;


//////////////////////  COLOR ENCODING/DECODING
//  FUNCTIONS  /////////////////////

//  Gamma-encode a fragment color according to this pass's output policy.
//  Returns color untouched when gamma_encode_output is false; otherwise raises
//  color.rgb to 1.0/get_pass_output_gamma().  Alpha is passed through unless
//  assume_opaque_alpha is set, in which case it is forced to 1.0.
inline float4 encode_output(const float4 color)
{
    if(!gamma_encode_output)
    {
        return color;
    }
    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
    float alpha = color.a;
    if(assume_opaque_alpha)
    {
        alpha = 1.0;
    }
    return float4(encoded_rgb, alpha);
}

//  Linearize a sampled color according to this pass's input policy.
//  Returns color untouched when linearize_input is false; otherwise raises
//  color.rgb to get_pass_input_gamma().  Alpha handling matches
//  encode_output().
inline float4 decode_input(const float4 color)
{
    if(!linearize_input)
    {
        return color;
    }
    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
    float alpha = color.a;
    if(assume_opaque_alpha)
    {
        alpha = 1.0;
    }
    return float4(linear_rgb, alpha);
}

//  Linearize a sampled color with a caller-supplied per-channel gamma.  Unlike
//  decode_input(), this always decodes, regardless of linearize_input.
inline float4 decode_gamma_input(const float4 color, const float3 gamma)
{
    float3 linear_rgb = pow(color.rgb, gamma);
    float alpha = color.a;
    if(assume_opaque_alpha)
    {
        alpha = 1.0;
    }
    return float4(linear_rgb, alpha);
}

//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
// EDIT: it's the 'const' in front of the coords that's doing it

///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////

//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
//  Provide a wide array of linearizing texture lookup wrapper functions.  The
//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
//  lookups are provided for completeness in case that changes someday.  Nobody
//  is likely to use the *fetch and *proj functions, but they're included just
//  in case.  The only tex*D texture sampling functions omitted are:
//      - tex*Dcmpbias
//      - tex*Dcmplod
//      - tex*DARRAY*
//      - tex*DMS*
//      - Variants returning integers
//  Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
{   return decode_input(tex1Dlod(tex, tex_coords));   }

inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));    }

//  tex1Dproj:
inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
{   return decode_input(tex1Dproj(tex, tex_coords));   }

inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
{   return decode_input(tex1Dproj(tex, tex_coords));   }

inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }

inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
*/
//  tex2D:
//  Sample tex and linearize the result with decode_input().  The coordinate
//  parameters are deliberately not const (see the FIXME/EDIT note ahead of
//  the texture lookup wrappers).  The float3 overloads only use .xy.
inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
{
    float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords);
    return decode_input(encoded_sample);
}

inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
{
    float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords.xy);
    return decode_input(encoded_sample);
}

//  NOTE(review): in the two overloads below, texel_off is forwarded as the
//  third argument of textureLod.  If textureLod is the GLSL built-in, that
//  argument is a level of detail rather than a texel offset -- confirm
//  whether any caller depends on this before changing it.
inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
{
    float4 encoded_sample = textureLod(tex, tex_coords, texel_off);
    return decode_input(encoded_sample);
}

inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
{
    float4 encoded_sample = textureLod(tex, tex_coords.xy, texel_off);
    return decode_input(encoded_sample);
}

//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }

//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }

//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));   }

//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));   }

//  tex2Dbias:
//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
//{   return decode_input(tex2Dbias(tex, tex_coords));   }

//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));   }

//  tex2Dfetch:
//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
//{   return decode_input(tex2Dfetch(tex, tex_coords));   }

//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }

//  tex2Dlod:
//  Sample tex at tex_coords.xy and linearize the result with decode_input().
//  NOTE(review): the first overload always passes 0.0 as the last textureLod
//  argument and never reads tex_coords.zw; the second passes texel_off in
//  that position instead.  Presumably callers only need mip level 0 --
//  verify against callers.
inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
{
    float4 encoded_sample = textureLod(tex, tex_coords.xy, 0.0);
    return decode_input(encoded_sample);
}

inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
{
    float4 encoded_sample = textureLod(tex, tex_coords.xy, texel_off);
    return decode_input(encoded_sample);
}
/*
//  tex2Dproj:
inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
{   return decode_input(tex2Dproj(tex, tex_coords));   }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
{   return decode_input(tex2Dproj(tex, tex_coords));   }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
*/
/*
//  tex3D:
inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
//  tex2D* functions can be useful:

//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }

//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));   }


//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
//  Provide a narrower selection of tex2D* wrapper functions that decode an
//  input sample with a specified gamma value.  These are useful for reading
//  LUT's and for reading the input of pass0 in a later pass.

//  tex2D:
inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }

inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);   }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);   }
/*
//  tex2Dbias:
inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }

inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);   }

//  tex2Dfetch:
inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);   }

inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
*/
//  NOTE(review): the block comment opened ahead of the tex3D wrappers earlier
//  in this file ends with a trailer line made of slashes followed by an
//  asterisk, which does not look like a comment terminator.  If so, everything
//  from there down to the terminator directly above this note -- including
//  both tex2D_linearize_gamma overloads -- is compiled out.  Confirm whether
//  that is intended before relying on those overloads.

//  tex2Dlod:
//  Sample tex at tex_coords.xy and decode the result with the caller-supplied
//  per-channel gamma via decode_gamma_input().
//  NOTE(review): as with tex2Dlod_linearize, the first overload always passes
//  0.0 as the last textureLod argument and the second passes texel_off there;
//  tex_coords.zw is never read -- verify against callers.
inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
{
    float4 encoded_sample = textureLod(tex, tex_coords.xy, 0.0);
    return decode_gamma_input(encoded_sample, gamma);
}

inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
{
    float4 encoded_sample = textureLod(tex, tex_coords.xy, texel_off);
    return decode_gamma_input(encoded_sample, gamma);
}


#endif  //  GAMMA_MANAGEMENT_H

////////////////////////////   END GAMMA-MANAGEMENT  //////////////////////////

//#include "quad-pixel-communication.h"

///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////

#ifndef QUAD_PIXEL_COMMUNICATION_H
#define QUAD_PIXEL_COMMUNICATION_H

+///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey* +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DISCLAIMER ///////////////////////////////// + +// *This code was inspired by "Shader Amortization using Pixel Quad Message +// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent +// is not to plagiarize his fundamentally similar code and assert my own +// copyright, but the algorithmic helper functions require so little code that +// implementations can't vary by much except bugfixes and conventions. I just +// wanted to license my own particular code here to avoid ambiguity and make it +// clear that as far as I'm concerned, people can do as they please with it. 
+ +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// Given screen pixel numbers, derive a "quad vector" describing a fragment's +// position in its 2x2 pixel quad. Given that vector, obtain the values of any +// variable at neighboring fragments. +// Requires: Using this file in general requires that: +// 1.) ddx() and ddy() are present in the current Cg profile. +// 2.) The GPU driver is using fine/high-quality derivatives. +// Functions will give incorrect results if this is not true, +// so a test function (fine_derivatives_working) is included. + + +///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES //////////////////// + +float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Two measures of the current fragment's output pixel number + // in the range ([0, output_size.x), [0, output_size.y)): + // 1.) output_pixel_num_wrt_uvxy.xy increase with uv coords. + // 2.) output_pixel_num_wrt_uvxy.zw increase with screen xy. + // Returns: Two measures of the fragment's position in its 2x2 quad: + // 1.) The .xy components are its 2x2 placement with respect to + // uv direction (the origin (0, 0) is at the top-left): + // top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0) + // bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0) + // You need this to arrange/weight shared texture samples. + // 2.) The .zw components are its 2x2 placement with respect to + // screen xy direction (position); the origin varies. + // quad_gather needs this measure to work correctly. + // Note: quad_vector.zw = quad_vector.xy * float2( + // ddx(output_pixel_num_wrt_uvxy.x), + // ddy(output_pixel_num_wrt_uvxy.y)); + // Caveats: This function assumes the GPU driver always starts 2x2 pixel + // quads at even pixel numbers. This assumption can be wrong + // for odd output resolutions (nondeterministically so). + // Method: For whole pixel numbers n, frac(n * 0.5) * 2.0 is 0.0 when n is + // even and 1.0 when n is odd; the second statement then remaps + // {0.0, 1.0} to {-1.0, 1.0}: 
+ float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; + float4 quad_vector = pixel_odd * 2.0 - float4(1.0); + return quad_vector; +} + +float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy) +{ + // Requires: Same as get_quad_vector_naive() (see that first). + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + float4 quad_vector_guess = + get_quad_vector_naive(output_pixel_num_wrt_uvxy); + // If quad_vector_guess.zw doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + // (Each guess component is -1.0 or 1.0 for whole pixel numbers, so half of + // its derivative across the quad is 1.0 when the guess increases with + // screen xy and -1.0 when it decreases; multiplying flips a wrong guess.) + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z), + ddy(quad_vector_guess.w)); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +float4 get_quad_vector(float2 output_pixel_num_wrt_uv) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) output_pixel_num_wrt_uv must increase with uv coords and + // measure the current fragment's output pixel number in: + // ([0, output_size.x), [0, output_size.y)) + // Returns: Same as get_quad_vector_naive() (see that first), but it's + // correct even if the 2x2 pixel quad starts at an odd pixel, + // which can occur at odd resolutions. + // Caveats: This function requires less information than the version + // taking a float4, but it's potentially slower. + // Do screen coords increase with or against uv? Get the direction + // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}. 
+ float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x), + ddy(output_pixel_num_wrt_uv.y)); + float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; + float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0; + float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; + // If quad_vector_screen_guess doesn't increase with screen xy, we know + // the 2x2 pixel quad starts at an odd pixel: + float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x), + ddy(quad_vector_screen_guess.y)); + float4 quad_vector_guess = float4( + quad_vector_uv_guess, quad_vector_screen_guess); + return quad_vector_guess * odd_start_mirror.xyxy; +} + +void quad_gather(float4 quad_vector, float4 curr, + out float4 adjx, out float4 adjy, out float4 diag) +{ + // Requires: 1.) ddx() and ddy() are present in the current Cg profile. + // 2.) The GPU driver is using fine/high-quality derivatives. + // 3.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 4.) curr is any vector you wish to get neighboring values of. + // Returns: Values of an input vector (curr) at neighboring fragments + // adjacent x, adjacent y, and diagonal (via out parameters). + // Method: With fine derivatives, ddx(curr)/ddy(curr) is the difference + // between the two fragments sharing this row/column of the quad. + // quad_vector.zw is -1.0 or 1.0 depending on which side this + // fragment is on, so subtracting derivative * quad_vector.z (or + // .w) steps from this fragment's value to its neighbor's. The + // diagonal value is the adjacent-x value stepped once more in y. 
+ adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float3 curr, + out float3 adjx, out float3 adjy, out float3 diag) +{ + // Float3 version (same requirements and outputs as the float4 overload): + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +void quad_gather(float4 quad_vector, float2 curr, + out float2 adjx, out float2 adjy, out float2 diag) +{ + // Float2 version (same requirements and outputs as the float4 overload): + adjx = curr - ddx(curr) * quad_vector.z; + adjy = curr - ddy(curr) * quad_vector.w; + diag = adjx - ddy(adjx) * quad_vector.w; +} + +float4 quad_gather(float4 quad_vector, float curr) +{ + // Float version: + // Returns: return.x == current + // return.y == adjacent x + // return.z == adjacent y + // return.w == diagonal + float4 all = float4(curr); + all.y = all.x - ddx(all.x) * quad_vector.z; + all.zw = all.xy - ddy(all.xy) * quad_vector.w; + return all; +} + +float4 quad_gather_sum(float4 quad_vector, float4 curr) +{ + // Requires: Same as quad_gather() + // Returns: Sum of an input vector (curr) at all fragments in a quad. + float4 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float3 quad_gather_sum(float4 quad_vector, float3 curr) +{ + // Float3 version: + float3 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float2 quad_gather_sum(float4 quad_vector, float2 curr) +{ + // Float2 version: + float2 adjx, adjy, diag; + quad_gather(quad_vector, curr, adjx, adjy, diag); + return (curr + adjx + adjy + diag); +} + +float quad_gather_sum(float4 quad_vector, float curr) +{ + // Float version: + float4 all_values = quad_gather(quad_vector, curr); + return (all_values.x + all_values.y + all_values.z + all_values.w); +} + +bool fine_derivatives_working(float4 quad_vector, float4 curr) +{ + // Requires: 1.) 
ddx() and ddy() are present in the current Cg profile. + // 2.) quad_vector describes the current fragment's location in + // its 2x2 pixel quad using get_quad_vector()'s conventions. + // 3.) curr must be a test vector with non-constant derivatives + // (its value should change nonlinearly across fragments). + // Returns: true if fine/hybrid/high-quality derivatives are used, or + // false if coarse derivatives are used or inconclusive + // Usage: Test whether quad-pixel communication is working! + // Method: We can confirm fine derivatives are used if the following + // holds (ever, for any value at any fragment): + // (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy)) + // The more values we test (e.g. test a float4 two ways), the + // easier it is to demonstrate fine derivatives are working. + // TODO: Check for floating point exact comparison issues! + float4 ddx_curr = ddx(curr); + float4 ddy_curr = ddy(curr); + float4 adjx = curr - ddx_curr * quad_vector.z; + float4 adjy = curr - ddy_curr * quad_vector.w; + // Compare each component separately; a mismatch in any one of them is + // enough to satisfy the condition described under Method above: + bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w)); + bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w)); + return any(bool2(ddy_different, ddx_different)); +} + +bool fine_derivatives_working_fast(float4 quad_vector, float curr) +{ + // Requires: Same as fine_derivatives_working() + // Returns: Same as fine_derivatives_working() + // Usage: This is faster than fine_derivatives_working() but more + // likely to return false negatives, so it's less useful for + // offline testing/debugging. It's also useless as the basis + // for dynamic runtime branching as of May 2014: Derivatives + // (and quad-pixel communication) are currently disallowed in + // branches. 
However, future GPU's may allow you to use them + // in dynamic branches if you promise the branch condition + // evaluates the same for every fragment in the quad (and/or if + // the driver enforces that promise by making a single fragment + // control branch decisions). If that ever happens, this + // version may become a more economical choice. + float ddx_curr = ddx(curr); + float ddy_curr = ddy(curr); + float adjx = curr - ddx_curr * quad_vector.z; + // Only the single scalar ddy comparison is made here; the float4 version + // also compares ddx against the adjacent-y value: + return (ddy_curr != ddy(adjx)); +} + +#endif // QUAD_PIXEL_COMMUNICATION_H + +//////////////////////// END QUAD-PIXEL-COMMUNICATION /////////////////////// + +//#include "special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt +// 2.) gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + // The fit is evaluated on abs(x) and the sign of x is restored at the end: + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings: + // erft() if ERF_FAST_APPROXIMATION is defined, else erf6(). + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) + // without hurting latency, but this allows more parallelism + // with outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like (k is the term index): + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + 0.0) * last_factorial) + // for(int k = 1; k < 4; ++k) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= k; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + // (Each bool mask converts to 0.0 or 1.0, so the two products select one + // branch result per component. NOTE(review): a NaN/inf produced by the + // unselected branch would still propagate through the multiply by 0.0 -- + // confirm callers keep both branches finite.) 
+ bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 uv2_to_uv4(float2 tex_uv) +{ + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); +} + +// Make a length squared helper macro (for usage with static constants): +#define LENGTH_SQ(vec) (dot(vec, vec)) + +inline float get_fast_gaussian_weight_sum_inv(const float sigma) +{ + // We can use the Gaussian integral to calculate the asymptotic weight for + // the center pixel. Since the unnormalized center pixel weight is 1.0, + // the normalized weight is the same as the weight sum inverse. 
Given a + // large enough blur (9+), the asymptotic weight sum is close and faster: + // center_weight = 0.5 * + // (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0)))) + // erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)): + // However, we can get even faster results with curve-fitting. These are + // also closer than the asymptotic results, because they were constructed + // from 64 blurs sizes from [3, 131) and 255 equally-spaced sigmas from + // (0, blurN_std_dev), so the results for smaller sigmas are biased toward + // smaller blurs. The max error is 0.0031793913. + // Relative FPS: 134.3 with erf, 135.8 with curve-fitting. + //static const float temp = 0.5/sqrt(2.0); + //return erf(temp/sigma); + return min(exp(exp(0.348348412457428/ + (sigma - 0.0860587260734721))), 0.399334576340352/sigma); +} + + +//////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// + +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 11x Gaussian blurred texture lookup using an 11-tap blur. + // It may be mipmapped depending on settings and dxdy. + // Calculate Gaussian blur kernel weights and a normalization factor for + // distances of 0-5, ignoring constant factors (since we're normalizing). + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Statically normalize weights, sum weighted samples, and return. Blurs are + // currently optimized for dynamic weights. 
+ float3 sum = float3(0.0,0.0,0.0); + sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file description). + // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. + // It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; + return sum * weight_sum_inv; +} + + +/////////////////////////// FAST SEPARABLE BLURS /////////////////////////// + +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: 1.) Global requirements must be met (see file description). + // 2.) filter_linearN must = "true" in your .cgp file. + // 3.) 
For gamma-correct bilinear filtering, global + // gamma_aware_bilinear == true (from gamma-management.h) + // Returns: A 1D 11x Gaussian blurred texture lookup using 6 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w45 = w4 + w5; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + const float w45_ratio = w5/w45; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + + // w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w16_17 = w16 + w17; + const float w18_19 = w18 + w19; + const float w20_21 = w20 + w21; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + const float w16_17_ratio = w17/w16_17; + const float w18_19_ratio = w19/w18_19; + const float w20_21_ratio = w21/w20_21; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + 
sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb; + sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb; + sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. 
+ const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur25fast(const 
sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + +//////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// + +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Global requirements must be met (see file 
description). + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the + // resized input. + // Description: + // This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize + // would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize. + const float denom_inv = 0.5/(sigma*sigma); + // Load each sample. We need all 3x3 samples. Quad-pixel communication + // won't help either: This should perform like tex2Dblur5x5, but sharing a + // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + // Statically compute Gaussian sample weights: + const float w4 = 1.0; + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); + // Weight and sum the samples: + const float3 sum = w4 * sample4 + + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); + return sum * weight_sum_inv; +} + + +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// + +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + 
const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of + // 5x5 carefully selected bilinear samples. + // Description: + // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the + // bilinear sample location to reflect the true Gaussian weights for each + // underlying texel. The following diagram illustrates the relative + // locations of bilinear samples. Each sample with the same number has the + // same weight (notice the symmetry). The letters a, b, c, d distinguish + // quadrants, and the letters U, D, L, R, C (up, down, left, right, center) + // distinguish 1D directions along the line containing the pixel center: + // 6a 5a 2U 5b 6b + // 4a 3a 1U 3b 4b + // 2L 1L 0C 1R 2R + // 4c 3c 1D 3d 4d + // 6c 5c 2D 5d 6d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2, 2x1, 1x2, or 1x1 texel block: + // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4 + // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2 + // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2 + // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2 + // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4 + // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2 + // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4 + // Note there is only one C texel and only two texels for each U, D, L, or + // R sample. The center sample is effectively a nearest neighbor sample, + // and the U/D/L/R samples use 1D linear filtering. All other texels are + // read with bilinear samples somewhere within their 2x2 texel blocks. 
+ + // COMPUTE TEXTURE COORDS: + // Statically compute sampling offsets within each 2x2 texel block, based + // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from + // the center, and reuse them independently for both dimensions. Compute + // these offsets based on the relative 1D Gaussian weights of the texels + // in question. (w1off means "Gaussian weight for the texel 1.0 texels + // away from the pixel center," etc.). + const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float w4off = exp(-16.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + const float texel3to4ratio = w4off/(w3off + w4off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." 
+ const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2R1 = w3off; + const float w2R2 = w4off; + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2R1 + w2R2; + const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; + const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4; + const float w5 = w4; + const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); + + // LOAD TEXTURE SAMPLES: + // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + // Sampling order doesn't seem to affect performance, so just be clear: + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * 
sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, 
tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2R + sample2D + sample2L + sample2U); + sum += w3 * (sample3d + sample3c + sample3b + sample3a); + sum += w4 * (sample4d + sample4c + sample4b + sample4a); + sum += w5 * (sample5d + sample5c + sample5b + sample5a); + sum += w6 * (sample6d + sample6c + sample6b + sample6a); + return sum * weight_sum_inv; +} + +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 7x7 blur with 4x4 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of + // 4x4 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 4a 3a 3b 4b + // 2a 1a 1b 2b + // 2c 1c 1d 2d + // 4c 3c 3d 4d + // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d, + // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c, + // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share + // the center texel): + // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4 + // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2 + // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4 + // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2 + // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4 + // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2 + // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float w3off = exp(-9.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + const float texel2to3ratio = w3off/(w2off + w3off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1abcd = 1.0; + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights. 
+ // Split weights for shared texels between samples sharing them: + const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; + const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; + const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = + 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); + + // LOAD TEXTURE SAMPLES: + // Load all 16 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * 
sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = float3(0.0,0.0,0.0); + sum += w1 * (sample1a + sample1b + sample1c + sample1d); + sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); + sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); + sum += w4 * (sample4a + sample4b + sample4c + sample4d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of + // 3x3 carefully selected bilinear samples. + // Description: + // First see the description for tex2Dblur9x9(). This blur uses the same + // concept and sample/texel locations except on a smaller scale. Samples: + // 2a 1U 2b + // 1L 0C 1R + // 2c 1D 2d + // Texels: + // 2a4 2a3 1U2 2b3 2b4 + // 2a2 2a1 1U1 2b1 2b2 + // 1L2 1L1 0C1 1R1 1R2 + // 2c2 2c1 1D1 2d1 2d2 + // 2c4 2c3 1D2 2d3 2d4 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w1off = exp(-1.0 * denom_inv); + const float w2off = exp(-4.0 * denom_inv); + const float texel1to2ratio = w2off/(w1off + w2off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including x-axis-aligned: + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + + // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: + // Statically compute Gaussian texel weights for the bottom-right quadrant. + // Read underscores as "and." + const float w1R1 = w1off; + const float w1R2 = w2off; + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + // Statically add texel weights in each sample to get sample weights: + const float w0 = 1.0; + const float w1 = w1R1 + w1R2; + const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); + + // LOAD TEXTURE SAMPLES: + // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = 
tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Statically normalize weights (so total = 1.0), and sum weighted samples. + float3 sum = w0 * sample0C; + sum += w1 * (sample1R + sample1D + sample1L + sample1U); + sum += w2 * (sample2a + sample2b + sample2c + sample2d); + return sum * weight_sum_inv; +} + +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Perform a 1-pass 3x3 blur with 2x2 bilinear samples. + // Requires: Same as tex2Dblur9() + // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of + // 2x2 carefully selected bilinear samples. + // Description: + // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This + // blur mixes concepts from both. The sample layout is as follows: + // 0a 0b + // 0c 0d + // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share + // a vertical column of texels, and samples 0a/0c and 0b/0d share a + // horizontal row of texels (all samples share the center texel): + // 0a3 0ab2 0b3 + // 0ac1 0*0 0bd1 + // 0c3 0cd2 0d3 + + // COMPUTE TEXTURE COORDS: + // Statically compute bilinear sampling offsets (details in tex2Dblur9x9). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w1off = exp(-1.0 * denom_inv); + const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); + // Statically compute texel offsets from the fragment center to each + // bilinear sample in the bottom-right quadrant, including axis-aligned: + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + + // LOAD TEXTURE SAMPLES: + // Load all 4 samples using symmetry: + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + + // SUM WEIGHTED SAMPLES: + // Weights for all samples are the same, so just average them: + return 0.25 * (sample0a + sample0b + sample0c + sample0d); +} + + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(video_size/output_size).y + // 6.) 
tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12 + // bilinear samples, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps. 
Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + // With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // 
dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const 
float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). + // To avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = 
GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = 
float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples 
from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. 
Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 
sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) +
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be some subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample1*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, 
dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return 
tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); +} + + +#endif // BLUR_FUNCTIONS_H + +//////////////////////////// END BLUR-FUNCTIONS /////////////////////////// + +/////////////////////////////// BLOOM CONSTANTS ////////////////////////////// + +// Compute constants with manual inlines of the functions below: +static const float bloom_diff_thresh = 1.0/256.0; + + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh); +} + +inline float get_absolute_scale_blur_sigma(const float thresh) +{ + // Requires: 1.) min_allowed_viewport_triads must be a global. The number + // of horizontal phosphor triads in the final image must be + // >= min_allowed_viewport_triads.x for realistic results. + // 2.) bloom_approx_scale_x must be a global float equal to the + // absolute horizontal scale of BLOOM_APPROX. + // 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x + // should be <= 1.1658025090 to keep the final result < + // 0.62666015625 (the largest sigma ensuring the largest + // unused texel weight stays < 1.0/256.0 for a 3x3 blur). + // 4.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0).
+ // Returns: Return the minimum Gaussian sigma that will blur the pass + // output as much as it would have taken to blur away + // bloom_approx_scale_x horizontal phosphor triads. + // Description: + // BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd + // use the same blur sigma as the actual phosphor bloom and scale it down + // to the current resolution with (bloom_approx_scale_x/viewport_size_x), but + // we don't know the viewport size in this pass. Instead, we'll blur as + // much as it would take to blur away min_allowed_viewport_triads.x. This + // will blur "more than necessary" if the user actually uses more triads, + // but that's not terrible either, because blurring a constant fraction of + // the viewport may better resemble a true optical bloom anyway (since the + // viewport will generally be about the same fraction of each player's + // field of view, regardless of screen size and resolution). + // Assume an extremely large viewport size for asymptotic results. NOTE(review): max_viewport_size_x is only visibly declared as a function-local static inside get_bloom_approx_sigma() -- confirm a global of that name is in scope here. + return bloom_approx_scale_x/max_viewport_size_x * + get_min_sigma_to_blur_triad( + max_viewport_size_x/min_allowed_viewport_triads.x, thresh); +} + +inline float get_center_weight(const float sigma) +{ + // Given a Gaussian blur sigma, get the blur weight for the center texel. NOTE(review): the RUNTIME_PHOSPHOR_BLOOM_SIGMA path returns get_fast_gaussian_weight_sum_inv(sigma) as-is while the static path returns weight_sum_inv squared -- confirm the two are meant to agree.
+ #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return get_fast_gaussian_weight_sum_inv(sigma); + #else + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + const float w16 = exp(-256.0 * denom_inv); + const float w17 = exp(-289.0 * denom_inv); + const float w18 = exp(-324.0 * denom_inv); + const float w19 = exp(-361.0 * denom_inv); + const float w20 = exp(-400.0 * denom_inv); + const float w21 = exp(-441.0 * denom_inv); + // Note: If the implementation uses a smaller blur than the max allowed, + // the worst case scenario is that the center weight will be overestimated, + // so we'll put a bit more energy into the brightpass...no huge deal. 
+ // Then again, if the implementation uses a larger blur than the max + // "allowed" because of dynamic branching, the center weight could be + // underestimated, which is more of a problem...consider always using + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // 43x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + + w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + // 31x blur: + const float weight_sum_inv = 1.0 / + (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + + w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + // 25x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + // 17x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + #else + // 9x blur: + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + const float center_weight = weight_sum_inv * weight_sum_inv; + return center_weight; + #endif +} + +inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // If sigma is static, we can safely branch and use the smallest blur + // that's big enough. Ignore #define hints, because we'll only use a + // large blur if we actually need it, and the branches cost nothing. 
+ #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #else + // It's still worth branching if the profile supports dynamic branches: + // It's much faster than using a hugely excessive blur, but each branch + // eats ~1% FPS. + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + #endif + #endif + // Failed optimization notes: + // I originally created a same-size mipmapped 5-tap separable blur10 that + // could handle any sigma by reaching into lower mip levels. It was + // as fast as blur25fast for runtime sigmas and a tad faster than + // blur31fast for static sigmas, but mipmapping two viewport-size passes + // ate 10% of FPS across all codepaths, so it wasn't worth it. + #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE + if(sigma <= blur9_std_dev) + { + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur17_std_dev) + { + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur25_std_dev) + { + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + } + else if(sigma <= blur31_std_dev) + { + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + } + else + { + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + } + #else + // If we can't afford to branch, we can only guess at what blur + // size we need. Therefore, use the largest blur allowed. 
+ #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); + #else + #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); + #else + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + #endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE +} + +inline float get_bloom_approx_sigma(const float output_size_x_runtime, + const float estimated_viewport_size_x) +{ + // Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x. + // This is included for dynamic codepaths just in case the + // following two globals are incorrect: + // 2.) bloom_approx_size_x_for_fake should == the same + // if PHOSPHOR_BLOOM_FAKE is #defined + // 3.) bloom_approx_size_x should == the same otherwise + // Returns: For gaussian4x4, return a dynamic small bloom sigma that's + // as close to optimal as possible given available information. + // For blur3x3, return a static small bloom sigma that + // works well for typical cases. Otherwise, we're using simple + // bilinear filtering, so use static calculations. + // Assume the default static value. This is a compromise that ensures + // typical triads are blurred, even if unusually large ones aren't.
+ static const float mask_num_triads_static = + max(min_allowed_viewport_triads.x, mask_num_triads_desired_static); + const float mask_num_triads_from_size = + estimated_viewport_size_x/mask_triad_size_desired; + const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x, + lerp(mask_num_triads_from_size, mask_num_triads_desired, + mask_specify_num_triads)); + // Assume an extremely large viewport size for asymptotic results: + static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // Use the runtime num triads and output size: + const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_runtime; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_runtime/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // account for the Gaussian scanline sigma from the last pass too. + // The bloom will be too wide horizontally but tall enough vertically. + return length(float2(bloom_approx_sigma, beam_max_sigma)); + } + else // 3x3 blur resize (the bilinear resize doesn't need a sigma) + { + // We're either using blur3x3 or bilinear filtering. The biggest + // reason to choose blur3x3 is to avoid dynamic weights, so use a + // static calculation. 
+ #ifdef PHOSPHOR_BLOOM_FAKE + static const float output_size_x_static = + bloom_approx_size_x_for_fake; + #else + static const float output_size_x_static = bloom_approx_size_x; + #endif + static const float asymptotic_triad_size = + max_viewport_size_x/mask_num_triads_static; + const float asymptotic_sigma = get_min_sigma_to_blur_triad( + asymptotic_triad_size, bloom_diff_thresh); + const float bloom_approx_sigma = + asymptotic_sigma * output_size_x_static/max_viewport_size_x; + // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but + // try accounting for the Gaussian scanline sigma from the last pass + // too; use the static default value: + return length(float2(bloom_approx_sigma, beam_max_sigma_static)); + } +} + +inline float get_final_bloom_sigma(const float bloom_sigma_runtime) +{ + // Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's + // optimal for the [known] triad size. + // 2.) Call this from a fragment shader (not a vertex shader), + // or blurring with static sigmas won't be constant-folded. + // Returns: Return the optimistic static sigma if the triad size is + // known at compile time. Otherwise return the optimal runtime + // sigma (10% slower) or an implementation-specific compromise + // between an optimistic or pessimistic static sigma. + // Notes: Call this from the fragment shader, NOT the vertex shader, + // so static sigmas can be constant-folded! + const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad( + mask_triad_size_desired_static, bloom_diff_thresh); + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + return bloom_sigma_runtime; + #else + // Overblurring looks as bad as underblurring, so assume average-size + // triads, not worst-case huge triads: + return bloom_sigma_optimistic; + #endif +} + + +#endif // BLOOM_FUNCTIONS_H + +//////////////////////////// END BLOOM-FUNCTIONS /////////////////////////// + +void main() { + // This pass: Sample (misconverged?) 
void main() {
    //  Fragment-stage entry point.  Writes one value: FragColor.
    //  This pass: Sample (misconverged?) scanlines to the final horizontal
    //  resolution, apply halation (bouncing electrons), and apply the phosphor
    //  mask.  Fake a bloom if requested.  Unless we fake a bloom, the output
    //  will be dim from the scanline auto-dim, mask dimming, and low gamma.
    //  Reads the interpolated coordinates scanline_tex_uv, video_uv,
    //  halation_tex_uv, blur3x3_tex_uv, scanline_texture_size_inv,
    //  mask_tile_start_uv_and_size and mask_tiles_per_screen, plus the
    //  VERTICAL_SCANLINES, HALATION_BLUR, BLOOM_APPROX and mask samplers.

    //  Horizontally sample the current row (a vertically interpolated scanline)
    //  and account for horizontal convergence offsets, given in units of texels.
    const float3 scanline_color_dim = sample_rgb_scanline_horizontal(
        VERTICAL_SCANLINEStexture, scanline_tex_uv,
        VERTICAL_SCANLINEStexture_size, scanline_texture_size_inv);
    //  The "_dim" suffix marks values still scaled by the auto-dim factor.
    const float auto_dim_factor = levels_autodim_temp;

    //  Sample the phosphor mask:
    const float2 tile_uv_wrap = video_uv * mask_tiles_per_screen;
    const float2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(
        tile_uv_wrap, mask_tile_start_uv_and_size);
    float3 phosphor_mask_sample;
    //  Without PHOSPHOR_MASK_MANUALLY_RESIZE the original (large) mask LUTs
    //  are always sampled; with it, get_mask_sample_mode() decides.
    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
        const bool sample_orig_luts = get_mask_sample_mode() > 0.5;
    #else
        static const bool sample_orig_luts = true;
    #endif
    if(sample_orig_luts)
    {
        //  If mask_type is static, this branch will be resolved statically.
        //  mask_type < 0.5 -> grille LUT, < 1.5 -> slot LUT, else shadow LUT.
        if(mask_type < 0.5)
        {
            phosphor_mask_sample = tex2D_linearize(
                mask_grille_texture_large, mask_tex_uv).rgb;
        }
        else if(mask_type < 1.5)
        {
            phosphor_mask_sample = tex2D_linearize(
                mask_slot_texture_large, mask_tex_uv).rgb;
        }
        else
        {
            phosphor_mask_sample = tex2D_linearize(
                mask_shadow_texture_large, mask_tex_uv).rgb;
        }
    }
    else
    {
        //  Sample the resized mask, and avoid tiling artifacts:
        phosphor_mask_sample = tex2Dtiled_mask_linearize(
            MASK_RESIZEtexture, mask_tex_uv).rgb;
    }

    //  Sample the halation texture (auto-dim to match the scanlines), and
    //  account for both horizontal and vertical convergence offsets, given
    //  in units of texels horizontally and same-field scanlines vertically:
    const float3 halation_color = tex2D_linearize(
        HALATION_BLURtexture, halation_tex_uv).rgb;

    //  Apply halation: Halation models electrons flying around under the glass
    //  and hitting the wrong phosphors (of any color).  It desaturates, so
    //  average the halation electrons to a scalar.  Reduce the local scanline
    //  intensity accordingly to conserve energy.
    //  The dot product with (k/3, k/3, k/3), k = auto_dim_factor, is the RGB
    //  mean scaled by k; float3(scalar) then replicates it to all channels.
    const float3 halation_intensity_dim =
        float3(dot(halation_color, float3(auto_dim_factor/3.0)));
    const float3 electron_intensity_dim = lerp(scanline_color_dim,
        halation_intensity_dim, halation_weight);

    //  Apply the phosphor mask:
    const float3 phosphor_emission_dim = electron_intensity_dim *
        phosphor_mask_sample;

    #ifdef PHOSPHOR_BLOOM_FAKE
        //  The BLOOM_APPROX pass approximates a blurred version of a masked
        //  and scanlined image.  It's usually used to compute the brightpass,
        //  but we can also use it to fake the bloom stage entirely.  Caveats:
        //  1.) A fake bloom is conceptually different, since we're mixing in a
        //      fully blurred low-res image, and the biggest implications are:
        //  2.) If mask_amplify is incorrect, results deteriorate more quickly.
        //  3.) The inaccurate blurring hurts quality in high-contrast areas.
        //  4.) The bloom_underestimate_levels parameter seems less sensitive.
        //  Reverse the auto-dimming and amplify to compensate for mask dimming:
        //  NOTE: the macro is defined unconditionally on the next line, so
        //  both #ifdef tests on it below always select their first branch.
        #define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
            static const float blur_contrast = 1.05;
        #else
            static const float blur_contrast = 1.0;
        #endif
        const float mask_amplify = get_mask_amplify();
        const float undim_factor = 1.0/auto_dim_factor;
        const float3 phosphor_emission =
            phosphor_emission_dim * undim_factor * mask_amplify;
        //  Get a phosphor blur estimate, accounting for convergence offsets:
        const float3 electron_intensity = electron_intensity_dim * undim_factor;
        const float3 phosphor_blur_approx_soft = tex2D_linearize(
            BLOOM_APPROXtexture, blur3x3_tex_uv).rgb;
        //  Mix 10% of the unblurred electron intensity back into the blur,
        //  then scale by blur_contrast.
        const float3 phosphor_blur_approx = lerp(phosphor_blur_approx_soft,
            electron_intensity, 0.1) * blur_contrast;
        //  We could blend between phosphor_emission and phosphor_blur_approx,
        //  solving for the minimum blend_ratio that avoids clipping past 1.0:
        //      1.0 >= total_intensity
        //      1.0 >= phosphor_emission * (1.0 - blend_ratio) +
        //              phosphor_blur_approx * blend_ratio
        //      blend_ratio = (phosphor_emission - 1.0)/
        //          (phosphor_emission - phosphor_blur_approx);
        //  However, this blurs far more than necessary, because it aims for
        //  full brightness, not minimal blurring.  To fix it, base blend_ratio
        //  on a max area intensity only so it varies more smoothly:
        const float3 phosphor_blur_underestimate =
            phosphor_blur_approx * bloom_underestimate_levels;
        const float3 area_max_underestimate =
            phosphor_blur_underestimate * mask_amplify;
        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
            //  Same form as the blend_ratio formula above, with
            //  area_max_underestimate in place of phosphor_emission and
            //  phosphor_blur_underestimate in place of phosphor_blur_approx.
            //  An out-of-range quotient is handled by the clamp further down.
            const float3 blend_ratio_temp =
                (area_max_underestimate - float3(1.0, 1.0, 1.0)) /
                (area_max_underestimate - phosphor_blur_underestimate);
        #else
            //  Try doing it like an area-based brightpass.  This is nearly
            //  identical, but it's worth toying with the code in case I ever
            //  find a way to make it look more like a real bloom.  (I've had
            //  some promising textures from combining an area-based blend ratio
            //  for the phosphor blur and a more brightpass-like blend-ratio for
            //  the phosphor emission, but I haven't found a way to make the
            //  brightness correct across the whole color range, especially with
            //  different bloom_underestimate_levels values.)
            const float desired_triad_size = lerp(mask_triad_size_desired,
                output_size.x/mask_num_triads_desired,
                mask_specify_num_triads);
            const float bloom_sigma = get_min_sigma_to_blur_triad(
                desired_triad_size, bloom_diff_thresh);
            const float center_weight = get_center_weight(bloom_sigma);
            const float3 max_area_contribution_approx =
                max(float3(0.0, 0.0, 0.0), phosphor_blur_approx -
                center_weight * phosphor_emission);
            const float3 area_contrib_underestimate =
                bloom_underestimate_levels * max_area_contribution_approx;
            const float3 blend_ratio_temp =
                ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
                area_max_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
        #endif
        //  Clamp blend_ratio in case it's out-of-range, but be SUPER careful:
        //  min/max/clamp are BIZARRELY broken with lerp (optimization bug?),
        //  and this redundant sequence avoids bugs, at least on nVidia cards:
        //  (Do not "simplify" the max(clamp(...)) pair; it is intentional.)
        const float3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0);
        //  bloom_excess pushes the ratio toward 1.0 (fully blurred).
        const float3 blend_ratio = lerp(blend_ratio_clamped, float3(1.0,1.0,1.0), bloom_excess);
        //  Blend the blurred and unblurred images:
        const float3 phosphor_emission_unclipped =
            lerp(phosphor_emission, phosphor_blur_approx, blend_ratio);
        //  Simulate refractive diffusion by reusing the halation sample.
        const float3 pixel_color = lerp(phosphor_emission_unclipped,
            halation_color, diffusion_weight);
    #else
        //  No fake bloom: pass the still-dimmed masked emission through.
        const float3 pixel_color = phosphor_emission_dim;
    #endif
    //  Encode if necessary, and output.
    FragColor = encode_output(float4(pixel_color, 1.0));
}
//  Vertex-stage preamble for this pass: stage inputs/outputs, uniforms,
//  fixed parameter values, pass-texture aliases, and the macros that let the
//  Cg/HLSL-style source below compile as GLSL.

//  Per-vertex inputs.
in vec4 position;
in vec2 texCoord;

//  Vertex-stage outputs, grouped in a single interface block.
out Vertex {
   vec2 vTexCoord;
   vec2 video_uv;
   vec2 scanline_tex_uv;
   vec2 blur3x3_tex_uv;
   vec2 halation_tex_uv;
   vec2 scanline_texture_size_inv;
   vec4 mask_tile_start_uv_and_size;
   vec2 mask_tiles_per_screen;
};

//  targetSize.xy is aliased as output_size, sourceSize[i].xy as the various
//  *_size macros, and phase as frame_count (see the macros further down).
uniform vec4 targetSize;
uniform vec4 sourceSize[];
uniform int phase;

// USER SETTINGS BLOCK //

//  Object-like macros that give fixed values to the un-suffixed parameter
//  names.  The *_static constants declared later in the user-settings section
//  are separate identifiers.
#define crt_gamma 2.500000
#define lcd_gamma 2.200000
#define levels_contrast 1.0
#define halation_weight 0.0
#define diffusion_weight 0.075
#define bloom_underestimate_levels 0.8
#define bloom_excess 0.000000
#define beam_min_sigma 0.020000
#define beam_max_sigma 0.300000
#define beam_spot_power 0.330000
#define beam_min_shape 2.000000
#define beam_max_shape 4.000000
#define beam_shape_power 0.250000
#define beam_horiz_filter 0.000000
#define beam_horiz_sigma 0.35
#define beam_horiz_linear_rgb_weight 1.000000
#define convergence_offset_x_r -0.000000
#define convergence_offset_x_g 0.000000
#define convergence_offset_x_b 0.000000
#define convergence_offset_y_r 0.000000
#define convergence_offset_y_g -0.000000
#define convergence_offset_y_b 0.000000
#define mask_type 1.000000
#define mask_sample_mode_desired 0.000000
#define mask_specify_num_triads 0.000000
#define mask_triad_size_desired 3.000000
#define mask_num_triads_desired 480.000000
#define aa_subpixel_r_offset_x_runtime -0.0
#define aa_subpixel_r_offset_y_runtime 0.000000
#define aa_cubic_c 0.500000
#define aa_gauss_sigma 0.500000
#define geom_mode_runtime 0.000000
#define geom_radius 2.000000
#define geom_view_dist 2.000000
#define geom_tilt_angle_x 0.000000
#define geom_tilt_angle_y 0.000000
#define geom_aspect_ratio_x 432.000000
#define geom_aspect_ratio_y 329.000000
#define geom_overscan_x 1.000000
#define geom_overscan_y 1.000000
#define border_size 0.015
#define border_darkness 2.0
#define border_compress 2.500000
#define interlace_bff 0.000000
#define interlace_1080i 0.000000

//  Aliases from the pass names used in the shader source to entries of the
//  source[] / sourceSize[] arrays.
#define VERTICAL_SCANLINEStexture source[5]
#define VERTICAL_SCANLINEStexture_size sourceSize[5].xy
#define VERTICAL_SCANLINESvideo_size sourceSize[5].xy
#define BLOOM_APPROXtexture source[3]
#define BLOOM_APPROXtexture_size sourceSize[3].xy
#define BLOOM_APPROXvideo_size sourceSize[3].xy
#define HALATION_BLURtexture source[1]
#define HALATION_BLURtexture_size sourceSize[1].xy
#define HALATION_BLURvideo_size sourceSize[1].xy
//  NOTE(review): both branches expand to the same sampler, and the mode macro
//  is only (optionally) defined further down in the user-settings section, so
//  this conditional currently has no effect.  Kept as found.
#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
	#define MASK_RESIZEtexture source[0]
#else
	#define MASK_RESIZEtexture source[0]
#endif
#define MASK_RESIZEtexture_size sourceSize[0].xy
#define MASK_RESIZEvideo_size sourceSize[0].xy

// END USER SETTINGS BLOCK //

// compatibility macros for transparently converting HLSLisms into GLSLisms
//  mul: the operands are multiplied in swapped order, as before.  Each
//  argument is parenthesized so a compound argument (e.g. v + w) keeps its
//  grouping after expansion.
#define mul(a,b) ((b)*(a))
#define lerp(a,b,c) mix(a,b,c)
#define saturate(c) clamp(c, 0.0, 1.0)
#define frac(x) (fract(x))
#define float2 vec2
#define float3 vec3
#define float4 vec4
#define bool2 bvec2
#define bool3 bvec3
#define bool4 bvec4
#define float2x2 mat2x2
#define float3x3 mat3x3
#define float4x4 mat4x4
#define float4x3 mat4x3
#define float2x4 mat2x4
#define IN params
#define texture_size sourceSize[0].xy
#define video_size sourceSize[0].xy
#define output_size targetSize.xy
#define frame_count phase
//  The three empty macros below erase the Cg-style qualifiers wherever they
//  appear in the code that follows.
#define static
#define inline
#define const
//  NOTE(review): HLSL fmod and GLSL mod differ for negative operands
//  (truncation vs. floor); confirm no caller relies on the HLSL sign rule.
#define fmod(x,y) mod(x,y)
#define ddx(c) dFdx(c)
#define ddy(c) dFdy(c)
//  NOTE(review): this passes the first argument second.  HLSL atan2 and the
//  two-argument GLSL atan both take (y, x), so call sites written in HLSL
//  order end up reversed; confirm against the callers before changing it.
#define atan2(x,y) atan(y,x)
#define rsqrt(c) inversesqrt(c)

#define input_texture source[0]

#if defined(GL_ES)
	#define COMPAT_PRECISION mediump
#else
	#define COMPAT_PRECISION
#endif

//  Pick the texture lookup name by GLSL version.
#if __VERSION__ >= 130
	#define COMPAT_TEXTURE texture
#else
	#define COMPAT_TEXTURE texture2D
#endif

// VERTEX INCLUDES //
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////

//  Inlined copy of user-settings.h: capability switches, codepath switches,
//  and the *_static default parameters.  Everything is wrapped in the
//  USER_SETTINGS_H include guard so a second inlined copy is skipped.

//#include "../user-settings.h"

///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////

#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H

///////////////////////////// DRIVER CAPABILITIES ////////////////////////////

//  The Cg compiler uses different "profiles" with different capabilities.
//  This shader requires a Cg compilation profile >= arbfp1, but a few options
//  require higher profiles like fp30 or fp40.  The shader can't detect profile
//  or driver capabilities, so instead you must comment or uncomment the lines
//  below with "//" before "#define."  Disable an option if you get compilation
//  errors resembling those listed.  Generally speaking, all of these options
//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.

//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
//  Among other things, derivatives help us fix anisotropic filtering artifacts
//  with curved manually tiled phosphor mask coords.  Related errors:
//  error C3004: function "float2 ddx(float2);" not supported in this profile
//  error C3004: function "float2 ddy(float2);" not supported in this profile
    //#define DRIVERS_ALLOW_DERIVATIVES

//  Fine derivatives: Unsupported on older ATI cards.
//  Fine derivatives enable 2x2 fragment block communication, letting us perform
//  fast single-pass blur operations.  If your card uses coarse derivatives and
//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
    #ifdef DRIVERS_ALLOW_DERIVATIVES
        #define DRIVERS_ALLOW_FINE_DERIVATIVES
    #endif

//  Dynamic looping: Requires an fp30 or newer profile.
//  This makes phosphor mask resampling faster in some cases.  Related errors:
//  error C5013: profile does not support "for" statements and "for" could not
//  be unrolled
    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES

//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
//  Using one static loop avoids overhead if the user is right, but if the user
//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
//  binary search can potentially save some iterations.  However, it may fail:
//  error C6001: Temporary register limit of 32 exceeded; 35 registers
//  needed to compile program
    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS

//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
//  this profile
    //#define DRIVERS_ALLOW_TEX2DLOD

//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
//  artifacts from anisotropic filtering and mipmapping.  Related errors:
//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
//  in this profile
    //#define DRIVERS_ALLOW_TEX2DBIAS

//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
//  impose stricter limitations on register counts and instructions.  Enable
//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
//  to compile program.
//  Enabling integrated graphics compatibility mode will automatically disable:
//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
//      (This may be reenabled in a later release.)
//  2.) RUNTIME_GEOMETRY_MODE
//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE


//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////

//  To disable a #define option, turn its line into a comment with "//."

//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
//  many of the options in this file and allow real-time tuning, but many of
//  them are slower.  Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
//  parameters?  This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
//  Specify the tilt at runtime?  This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
//  Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT

//  PHOSPHOR MASK:
//  Manually resize the phosphor mask for best results (slower)?  Disabling this
//  removes the option to do so, but it may be faster without dynamic branches.
    #define PHOSPHOR_MASK_MANUALLY_RESIZE
//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
//  Larger blurs are expensive, but we need them to blur larger triads.  We can
//  detect the right blur if the triad size is static or our profile allows
//  dynamic branches, but otherwise we use the largest blur the user indicates
//  they might need:
    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
    //  Here's a helpful chart:
    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect


/////////////////////////////// USER PARAMETERS //////////////////////////////

//  Note: Many of these static parameters are overridden by runtime shader
//  parameters when those are enabled.  However, many others are static codepath
//  options that were cleaner or more convenient to code as static constants.

//  GAMMA:
    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]

//  LEVELS MANAGEMENT:
    //  Control the final multiplicative image contrast:
    static const float levels_contrast_static = 1.0;            //  range [0, 4)
    //  We auto-dim to avoid clipping between passes and restore brightness
    //  later.  Control the dim factor here: Lower values clip less but crush
    //  blacks more (static only for now).
    //  NOTE(review): an earlier remark on this line said the value had been
    //  raised to 1.0 because 0.5 was unnecessarily dark, but the value in
    //  effect here is 0.5 -- confirm which one is intended.
    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; currently 0.5

//  HALATION/DIFFUSION/BLOOM:
    //  Halation weight: How much energy should be lost to electrons bouncing
    //  around under the CRT glass and exciting random phosphors?
    static const float halation_weight_static = 0.0;            //  range [0, 1]
    //  Refractive diffusion weight: How much light should spread/diffuse from
    //  refracting through the CRT glass?
    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
    //  Underestimate brightness: Bright areas bloom more, but we can base the
    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
    //  Blur all colors more than necessary for a softer phosphor bloom?
    static const float bloom_excess_static = 0.0;               //  range [0, 1]
    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
    //  blurred resize of the input (convergence offsets are applied as well).
    //  There are three filter options (static option only for now):
    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
    //      and beam_max_sigma is low.
    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
    //      always uses a static sigma regardless of beam_max_sigma or
    //      mask_num_triads_desired.
    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
    //  These options are more pronounced for the fast, unbloomed shader version.
    //  (Defining RADEON_FIX selects option 1 instead of option 2.)
#ifndef RADEON_FIX
    static const float bloom_approx_filter_static = 2.0;
#else
    static const float bloom_approx_filter_static = 1.0;
#endif

//  ELECTRON BEAM SCANLINE DISTRIBUTION:
    //  How many scanlines should contribute light to each pixel?  Using more
    //  scanlines is slower (especially for a generalized Gaussian) but less
    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
    //  max_beam_sigma at which the closest unused weight is guaranteed <
    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
    //  A generalized Gaussian beam varies shape with color too, not just width.
    //  It's slower but more flexible (static option only for now).
    static const bool beam_generalized_gaussian = true;
    //  What kind of scanline antialiasing do you want?
    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
    //  Integrals are slow (especially for generalized Gaussians) and rarely any
    //  better than 3x antialiasing (static option only for now).
    static const float beam_antialias_level = 1.0;              //  range [0, 2]
    //  Min/max standard deviations for scanline beams: Higher values widen and
    //  soften scanlines.  Depending on other options, low min sigmas can alias.
    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
    //  Beam width varies as a function of color: A power function (0) is more
    //  configurable, but a spherical function (1) gives the widest beam
    //  variability without aliasing (static option only for now).
    static const float beam_spot_shape_function = 0.0;
    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
    //  Generalized Gaussian max shape parameters: Higher values give flatter
    //  scanline plateaus and steeper dropoffs, simultaneously widening and
    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
    //  values > ~40.0 cause artifacts with integrals.
    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
    //  Generalized Gaussian shape power: Affects how quickly the distribution
    //  changes shape from Gaussian to steep/plateaued as color increases from 0
    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
    //  appear sharper for most colors.
    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
    //  What filter should be used to sample scanlines horizontally?
    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
    static const float beam_horiz_filter_static = 0.0;
    //  Standard deviation for horizontal Gaussian resampling:
    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
    //  limiting circuitry in some CRT's), or a weighted avg.?
    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
    //  later passes (static option only for now).
    static const bool beam_misconvergence = true;
    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
    //  Detect interlacing (static option only for now)?
    static const bool interlace_detect = true;
    //  Assume 1080-line sources are interlaced?
    static const bool interlace_1080i_static = false;
    //  For interlaced sources, assume TFF (top-field first) or BFF order?
    //  (Whether this matters depends on the nature of the interlaced input.)
    static const bool interlace_bff_static = false;

//  ANTIALIASING:
    //  What AA level do you want for curvature/overscan/subpixels?  Options:
    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
    //  (Static option only for now)
    static const float aa_level = 12.0;                     //  range [0, 24]
    //  What antialiasing filter do you want (static option only)?  Options:
    //  0: Box (separable), 1: Box (cylindrical),
    //  2: Tent (separable), 3: Tent (cylindrical),
    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
    static const float aa_filter = 6.0;                     //  range [0, 9]
    //  Flip the sample grid on odd/even frames (static option only for now)?
    static const bool aa_temporal = false;
    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
    //  the blue offset is the negative r offset; range [0, 0.5]
    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
    //  4.) C = 0.0 is a soft spline filter.
    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]

//  PHOSPHOR MASK:
    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
    static const float mask_type_static = 1.0;                  //  range [0, 2]
    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
    //      is halfway decent with LUT mipmapping but atrocious without it.
    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
    //      This mode reuses the same masks, so triads will be enormous unless
    //      you change the mask LUT filenames in your .cgp file.
    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
    //  will always be used to calculate the full bloom sigma statically.
    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
    //  triads) will be rounded to the nearest integer tile size and clamped to
    //  obey minimum size constraints (imposed to reduce downsize taps) and
    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
    //  To increase the size limit, double the viewport-relative scales for the
    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
    //      range [1, mask_texture_small_size/mask_triads_per_tile]
    static const float mask_triad_size_desired_static = 24.0 / 8.0;
    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
    //  final size will be rounded and constrained as above); default 480.0
    static const float mask_num_triads_desired_static = 480.0;
    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
    //  more samples and avoid moire a bit better, but some is unavoidable
    //  depending on the destination size (static option for now).
    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
    //  The mask is resized using a variable number of taps in each dimension,
    //  but some Cg profiles always fetch a constant number of taps no matter
    //  what (no dynamic branching).  We can limit the maximum number of taps if
    //  we statically limit the minimum phosphor triad size.  Larger values are
    //  faster, but the limit IS enforced (static option only, forever);
    //      range [1, mask_texture_small_size/mask_triads_per_tile]
    //  TODO: Make this 1.0 and compensate with smarter sampling!
    static const float mask_min_allowed_triad_size = 2.0;

//  GEOMETRY:
    //  Geometry mode:
    //  0: Off (default), 1: Spherical mapping (like cgwg's),
    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
    static const float geom_mode_static = 0.0;      //  range [0, 3]
    //  Radius of curvature: Measured in units of your viewport's diagonal size.
    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
    //  View dist is the distance from the player to their physical screen, in
    //  units of the viewport's diagonal size.  It controls the field of view.
    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
    //  Tilt angle in radians (clockwise around up and right vectors):
    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
    //  Aspect ratio: When the true viewport size is unknown, this value is used
    //  to help convert between the phosphor triad size and count, along with
    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
    //  default (256/224)*(54/47) = 1.313069909 (see below)
    static const float geom_aspect_ratio_static = 1.313069909;
    //  Before getting into overscan, here's some general aspect ratio info:
    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
    //  a.) Enable Retroarch's "Crop Overscan"
    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
    //  Real consoles use horizontal black padding in the signal, but emulators
    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
    //  without doing a. or b., but horizontal image borders will be tighter
    //  than vertical ones, messing up curvature and overscan.  Fixing the
    //  padding first corrects this.
    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
    //  above: Values < 1.0 zoom out; range (0, inf)
    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
    //  with strong curvature (static option only for now).
    static const bool geom_force_correct_tangent_matrix = true;

//  BORDERS:
    //  Rounded border size in texture uv coords:
    static const float border_size_static = 0.015;           //  range [0, 0.5]
    //  Border darkness: Moderate values darken the border smoothly, and high
    //  values make the image very dark just inside the border:
    static const float border_darkness_static = 2.0;        //  range [0, inf)
    //  Border compression: High numbers compress border transitions, narrowing
    //  the dark border area.
    static const float border_compress_static = 2.5;        //  range [1, inf)


#endif  //  USER_SETTINGS_H

//////////////////////////// END USER-SETTINGS //////////////////////////
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 (an earlier note here said that was unnecessarily dark and that it was set to 1.0, but the value in this file is 0.5; confirm which is intended) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.h, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords.
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limigations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuitity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking:" +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. 
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (NOTE: an older remark here said it was raised to 1.0, but the value in effect is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (an earlier note claimed this was raised to 1.0 to avoid an overly dark image, but the value here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + k) * last_factorial) + // for(int i = 0; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    // Colors between passes are gamma-encoded, so every pass linearizes its
+    // input and re-encodes its output.  Only the first pass uses the true
+    // input gamma and only the last pass uses the true output gamma; every
+    // other boundary uses the intermediate gamma.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifndef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #endif
+    #ifndef LAST_PASS
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #endif
+#else
+    // Colors between passes stay linear: only the first pass decodes and only
+    // the last pass encodes.  Everywhere else the flag is false and the
+    // reported gamma is 1.0.
+    #ifndef FIRST_PASS
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #else
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #endif
+    #ifndef LAST_PASS
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #else
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #endif
+#endif  // GAMMA_ENCODE_EVERY_FBO
+
+// Let users query whether bilinear filtering of this pass's input will be
+// gamma-correct (it is exactly when the shader does not linearize the input):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Parameters: color is the value this pass is about to write.
+    //  Returns:    color unchanged if gamma_encode_output is false.
+    //              Otherwise rgb raised to 1.0/get_pass_output_gamma(), with
+    //              alpha forced to 1.0 when assume_opaque_alpha is set and
+    //              passed through untouched when it is not.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Parameters: color is a sample read from this pass's input.
+    //  Returns:    color unchanged if linearize_input is false.
+    //              Otherwise rgb raised to get_pass_input_gamma(), with
+    //              alpha forced to 1.0 when assume_opaque_alpha is set and
+    //              passed through untouched when it is not.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 decoded_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, out_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Parameters: color is a sample to decode; gamma is the per-channel
+    //              exponent supplied by the caller.
+    //  Returns:    rgb raised to gamma unconditionally (linearize_input is
+    //              not consulted here).  Alpha follows the same
+    //              assume_opaque_alpha rule as decode_input().
+    float3 decoded_rgb = pow(color.rgb, gamma);
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, out_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+// Provide a wide array of linearizing texture lookup wrapper functions.  The
+// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+// lookups are provided for completeness in case that changes someday.  Nobody
+// is likely to use the *fetch and *proj functions, but they're included just
+// in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian integral_contrib() for detailed comments! 
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), intermediate_gamma); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling. 
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +//#include "phosphor-mask-resizing.h" + +//////////////////////// BEGIN PHOSPHOR-MASK-RESIZING //////////////////////// + +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" +//#include "derived-settings-and-constants.h" + +///////////////////////////// CODEPATH SELECTION ///////////////////////////// + +// Choose a looping strategy based on what's allowed: +// Dynamic loops not allowed: Use a flat static loop. +// Dynamic loops accomodated: Coarsely branch around static loops. +// Dynamic loops assumed allowed: Use a flat dynamic loop. 
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define BREAK_LOOPS_INTO_PIECES + #else + #define USE_SINGLE_STATIC_LOOP + #endif +#endif // No else needed: Dynamic loops assumed. + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +// The larger the resized tile, the fewer samples we'll need for downsizing. +// See if we can get a static min tile size > mask_min_allowed_tile_size: +static const float mask_min_allowed_tile_size = ceil( + mask_min_allowed_triad_size * mask_triads_per_tile); +static const float mask_min_expected_tile_size = + mask_min_allowed_tile_size; +// Limit the number of sinc resize taps by the maximum minification factor: +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * + mask_resize_src_lut_size.x/mask_min_expected_tile_size; +// Vectorized loops sample in multiples of 4. Round up to be safe: +static const float max_sinc_resize_samples_m4 = ceil( + max_sinc_resize_samples_float * 0.25) * 4.0; + + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) +{ + // Requires: The following global constants must be defined: + // 1.) mask_sinc_lobes + // 2.) max_sinc_resize_samples_m4 + // Returns: The minimum number of texture samples for a correct downsize + // at magnification_scale. + // We're downsizing, so the filter is sized across 2*lobes output pixels + // (not 2*lobes input texels). This impacts distance measurements and the + // minimum number of input samples needed. 
+ const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale; + const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0; + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + const float max_samples_m4 = max_sinc_resize_samples_m4; + #else // ifdef BREAK_LOOPS_INTO_PIECES + // Simulating loops with branches imposes a 128-sample limit. + const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4); + #endif + return min(min_samples_m4, max_samples_m4); +} + +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, + const float input_tiles_per_texture_r, const float samples, + static const bool vertical) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) input_tiles_per_texture_r is the number of input tiles + // that can fit in the input texture in the direction we're + // resampling this pass. + // 3.) vertical indicates whether we're resampling vertically + // this pass (or horizontally). + // Returns: Pack and return the first sample's tile_uv coord in [0, 1] + // and its texel distance from the destination pixel, in the + // resized dimension only. + // We'll start with the topmost or leftmost sample and work down or right, + // so get the first sample location and distance. Modify both dimensions + // as if we're doing a one-pass 2D resize; we'll throw away the unneeded + // (and incorrect) dimension at the end. + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. 
+ const float2 first_texel_tile_uv_wrap_2D = + first_texel_uv_wrap_2D * input_tiles_per_texture_r; + // Project wrapped coordinates to the [0, 1] range. We'll do this with all + // samples, but the first texel is special, since it might be negative. + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; + // Pack the first texel's tile_uv coord and texel distance in 1D: + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + return vertical ? tile_v_and_dist : tile_u_and_dist; + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); +} + +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) +{ + // Mipmapping and anisotropic filtering get confused by sinc-resampling. + // One [slow] workaround is to select the lowest mip level (explicit LOD 0): + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + return textureLod(tex, tex_uv, 0.0); + #else + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); + #else + return texture(tex, tex_uv); + #endif + #endif +} + + +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.)
Accumulate the weight sum in weights, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples. + #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + +//////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// + +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) +{ + // Requires: 1.) dr == du == 1.0/texture_size.x or + // dr == dv == 1.0/texture_size.y + // (whichever direction we're resampling in). + // It's a scalar to save register space. + // 2.) tile_size_uv_r is the number of texels an input tile + // takes up in the input texture, in the direction we're + // resampling this pass. + // 3.) magnification_scale must be <= 1.0. + // Returns: Return a [Lanczos] sinc-resampled pixel of a vertically + // downsized input tile embedded in an input texture. (The + // vertical version is special-cased though: It assumes the + // tile size equals the [static] texture size, since it's used + // on an LUT texture input containing one tile. For more + // generic use, eliminate the "static" in the parameters.) + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dy" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // A static loop can be faster, but it might blur too much from using + // more samples than it should. 
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along the resized + // dimension) and distance from the output location (in texels): + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // true = vertical resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + static const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + VERTICAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, + const float magnification_scale, const float tile_size_uv_r) +{ + // Differences from downsample_vertical_sinc_tiled: + // 1.) The dr and tile_size_uv_r parameters are not static consts. + // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is + // set to false instead of true. + // 3.) The horizontal version of the loop body is used. + // TODO: If we can get guaranteed compile-time dead code elimination, + // we can combine the vertical/horizontal downsampling functions by: + // 1.) Add an extra static const bool parameter called "vertical." + // 2.) Pass it through to get_first_texel_tile_uv_and_dist(). + // 3.) Use a conditional assignment in the loop body macro. This is the + // tricky part: We DO NOT want to incur the extra conditional + // assignment in the inner loop at runtime! + // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension + // we're resizing along, e.g. "dx" in this case. + #ifdef USE_SINGLE_STATIC_LOOP + // If we have to load all samples, we might as well use them.
+ static const int samples = int(max_sinc_resize_samples_m4); + #else + const int samples = int(get_dynamic_loop_size(magnification_scale)); + #endif + + // Get the first sample location (scalar tile uv coord along resized + // dimension) and distance from the output location (in texels): + const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + // false = horizontal resize: + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + // Get the tile sample offset: + const float tile_dr = dr * input_tiles_per_texture_r; + + // Sum up each weight and weighted sample color, varying the looping + // strategy based on our expected dynamic loop capabilities. See the + // loop body macros above. + int i_base = 0; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; + #ifdef BREAK_LOOPS_INTO_PIECES + if(samples - i_base >= 64) + { + for(int i = 0; i < 64; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 64; + } + if(samples - i_base >= 32) + { + for(int i = 0; i < 32; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 32; + } + if(samples - i_base >= 16) + { + for(int i = 0; i < 16; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 16; + } + if(samples - i_base >= 8) + { + for(int i = 0; i < 8; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 8; + } + if(samples - i_base >= 4) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + i_base += 4; + } + // Do another 4-sample block for a total of 128 max samples. 
+ if(samples - i_base > 0) + { + for(int i = 0; i < 4; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + } + #else + for(int i = 0; i < samples; i += i_step) + { + HORIZONTAL_SINC_RESAMPLE_LOOP_BODY; + } + #endif + // Normalize so the weight_sum == 1.0, and return: + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + + weight_sum_reduce.y); + return (pixel_color/scalar_weight_sum); +} + + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. 
+ // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + mask_triad_size_desired, + estimated_viewport_size.x / mask_num_triads_desired, + mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. + return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. 
This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so are max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUTs might clamp x more than y (all provided LUTs are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y: + // Viewport scales are equal by default. + // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly.
Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == output_size for a pass set to + // 1.0 viewport scale (i.e. it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) 
+ const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. + static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) 
+ // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any two fragments in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. + // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonally opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) +{ + // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the + // tile spans from [0, 1], such that (0.5, 0.5) is at the + // tile center. The input coords can range from [0, inf], + // and their fractional parts map to a repeated tile. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords + // for the start of the embedded tile in the full texture. + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] + // tex_uv size of the embedded tile in the full texture.
+ // Returns: Return tex_uv coords (used for texture sampling) + // corresponding to tile_uv_wrap. + if(get_mask_sample_mode() < 0.5) + { + // Manually repeat the resized mask tile to fill the screen: + // First get fractional tile_uv coords. Using frac/fmod on coords + // confuses anisotropic filtering; fix it as user options dictate. + // derived-settings-and-constants.h disables incompatible options. + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; + #else + float2 tile_uv = frac(tile_uv_wrap); + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); + tile_uv = fix_tiling_discontinuities_normalized(tile_uv, + tile_uv_dx, tile_uv_dy); + #endif + // The tile is embedded in a padded FBO, and it may start at a + // nonzero offset if border texels are used to avoid artifacts: + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + tile_uv * mask_tile_start_uv_and_size.zw; + return mask_tex_uv; + } + else + { + // Sample from the input phosphor mask texture with hardware tiling. + // If we're tiling at the original size (mode 2), the "tile" is the + // whole texture, and it contains a large number of triads mapped with + // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single + // unresized tile. tile_uv_wrap already has correct coords for both! + return tile_uv_wrap; + } +} + + +#endif // PHOSPHOR_MASK_RESIZING_H + +///////////////////////// END PHOSPHOR-MASK-RESIZING ///////////////////////// + +//#include "../../../../include/gamma-management.h" +// already got it + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 tex2Dtiled_mask_linearize(const sampler2D tex, + const float2 tex_uv) +{ + // If we're manually tiling a texture, anisotropic filtering can get + // confused. 
One workaround is to just select the lowest mip level: + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + // TODO: Use tex2Dlod_linearize with a calculated mip level. + return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0)); + #else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0)); + #else + return tex2D_linearize(tex, tex_uv); + #endif + #endif + #else + return tex2D_linearize(tex, tex_uv); + #endif +} + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +// END VERTEX INCLUDES // + +float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + +void main() { + gl_Position = position; + vTexCoord = texCoord * 1.00001; + + float2 tex_uv = vTexCoord.xy; + // Our various input textures use different coords. + video_uv = tex_uv * texture_size/video_size; + scanline_texture_size_inv = + float2(1.0, 1.0)/VERTICAL_SCANLINEStexture_size; + //video_uv = video_uv; + scanline_tex_uv = video_uv * VERTICAL_SCANLINESvideo_size * + scanline_texture_size_inv; + blur3x3_tex_uv = video_uv * BLOOM_APPROXvideo_size / + BLOOM_APPROXtexture_size; + halation_tex_uv = video_uv * HALATION_BLURvideo_size / + HALATION_BLURtexture_size; + //scanline_texture_size_inv = scanline_texture_size_inv; + + // Get a consistent name for the final mask texture size. Sample mode 0 + // uses the manually resized mask, but ignore it if we never resized. + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_texture_size = mask_sample_mode < 0.5 ? + MASK_RESIZEtexture_size : mask_texture_large_size; + const float2 mask_resize_video_size = mask_sample_mode < 0.5 ? 
+ MASK_RESIZEvideo_size : mask_texture_large_size; + #else + const float2 mask_resize_texture_size = mask_texture_large_size; + const float2 mask_resize_video_size = mask_texture_large_size; + #endif + // Compute mask tile dimensions, starting points, etc.: + //float2 mask_tiles_per_screen; + mask_tile_start_uv_and_size = get_mask_sampling_parameters( + mask_resize_texture_size, mask_resize_video_size, output_size, + mask_tiles_per_screen); + //mask_tiles_per_screen = mask_tiles_per_screen; +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs new file mode 100644 index 00000000..d090c529 --- /dev/null +++ b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs @@ -0,0 +1,5963 @@ +#version 150 + +uniform sampler2D source[]; +uniform vec4 sourceSize[]; +uniform vec4 targetSize; +uniform int phase; + +in Vertex { + vec2 vTexCoord; + vec2 uv_step; + vec2 il_step_multiple; + float pixel_height_in_scanlines; +}; + +out vec4 FragColor; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define 
mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 0.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE texture2D +#endif + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef 
USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. 
+// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? 
They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said 0.5 was unnecessarily dark and the value had been set to 1.0, but the constant in effect is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5 (an earlier note claimed this was raised to 1.0, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian min/max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-params.h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note said 1.0 was used because 0.5 looked unnecessarily dark, but the value set here is 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels. NOTE(review): max_subpixel_offset is the signed .x of aa_subpixel_r_offset_static, which user-settings sets to -1.0/3.0, so the sum below shrinks the border instead of widening it; abs() may have been intended -- confirm: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms (the scalar operands, e.g. geom_overscan_x or convergence_offset_x_r, are declared outside this section): +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get a unit-length aspect ratio vector. Enforce geom_max_aspect_ratio as an upper bound (via min), and prevent + // the absolute scale from affecting the uv-mapping for curvature (via normalize): + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": get_mask_amplify() returns 1.0/avg_color for the mask selected by mask_type (< 0.5 grille, < 1.5 slot, otherwise shadow), and get_mask_sample_mode() clamps the sample mode to [1.0, 2.0] unless PHOSPHOR_MASK_MANUALLY_RESIZE is #defined. +inline float get_mask_amplify() +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. 
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an older remark here said 0.5 was unnecessarily dark and 1.0 was used instead, but the value actually set is 0.5. + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an older remark here said 0.5 was unnecessarily dark and 1.0 was used instead, but the value actually set is 0.5. + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions. 
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + k) * last_factorial) + // for(int i = 0; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.)
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines.
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
//      int texel_off)
//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }

//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }

// tex2Dbias:
//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
//{ return decode_input(tex2Dbias(tex, tex_coords)); }

//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }

// tex2Dfetch:
//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
//{ return decode_input(tex2Dfetch(tex, tex_coords)); }

//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }

//  tex2Dlod:
//  Sample "tex" at tex_coords.xy with an explicit LOD and pass the texel
//  through decode_input() (defined elsewhere in this file).
//  Parameters: 1.) tex is the sampler to read.
//              2.) tex_coords: only .xy is used; .zw is ignored, and the
//                  overload without texel_off always samples LOD 0.0.
//              3.) texel_off (second overload) is forwarded to textureLod()
//                  as its LOD argument.
//  NOTE(review): Ignoring tex_coords.w and reusing texel_off as the LOD look
//  like porting shortcuts; confirm against the callers before relying on
//  either.
//  The int -> float conversion is spelled out because some GLSL dialects
//  (e.g. GLSL ES) perform no implicit conversions.
inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
{
    return decode_input(textureLod(tex, tex_coords.xy, 0.0));
}

inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
{
    return decode_input(textureLod(tex, tex_coords.xy, float(texel_off)));
}
/*
// tex2Dproj:
inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
{ return decode_input(tex2Dproj(tex, tex_coords)); }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
{ return decode_input(tex2Dproj(tex, tex_coords)); }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }

inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
*/
/*
// tex3D:
inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
//      tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }

//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
/*
// tex2Dbias:
inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }

inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }

// tex2Dfetch:
inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }

inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
*/
//  tex2Dlod:
//  Sample "tex" at tex_coords.xy with an explicit LOD and decode the texel
//  with the caller-supplied gamma via decode_gamma_input() (defined elsewhere
//  in this file).  tex_coords.zw is ignored; the overload without texel_off
//  always samples LOD 0.0, and the other forwards texel_off as the LOD.
//  NOTE(review): Reusing texel_off as the LOD looks like a porting shortcut;
//  confirm against the callers.
//  The int -> float conversion is spelled out because some GLSL dialects
//  (e.g. GLSL ES) perform no implicit conversions.
inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
{
    return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);
}

inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
{
    return decode_gamma_input(
        textureLod(tex, tex_coords.xy, float(texel_off)), gamma);
}


#endif // GAMMA_MANAGEMENT_H

//////////////////////////// END GAMMA-MANAGEMENT //////////////////////////

//////////////////////////////// END INCLUDES ////////////////////////////////

///////////////////////////// SCANLINE FUNCTIONS /////////////////////////////

inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
{
    //  Requires:   Globals:
    //              1.) beam_min_sigma and beam_max_sigma are global floats
    //                  containing the desired minimum and maximum beam standard
    //                  deviations, for dim and bright colors respectively.
    //              2.) beam_max_sigma must be > 0.0
    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
    //              4.) beam_spot_power must be defined as a global float.
    //              Parameters:
    //              1.) color is the underlying source color along a scanline
    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
    //                  sigma_range as a parameter to avoid repeated computation
    //                  when beam_{min, max}_sigma are runtime shader parameters
    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
    //              inner f(color) subfunction (see below) as:
    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
    //                  f(color) = pow(color, beam_spot_power)
    //  Returns:    The standard deviation of the Gaussian beam for "color:"
    //                  sigma = beam_min_sigma + sigma_range * f(color)
    //  Details/Discussion:
    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
    //  spots look like diamonds, and a spherical function or cube balances
    //  between variable width and a soft/realistic shape.  A beam_spot_power
    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
    //  final shape also differs based on the horizontal resampling filter and
    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
    //  shape, but a sixth root is still quite soft.  A power function (default
    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
    //  has the highest variability without an awful spot shape.
    //
    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
    //  difference from beam_max_sigma affects beam width variability.  It only
    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
    //  a conservative estimate for a more complex constraint).
    //
    //  beam_max_sigma affects clipping and increasing scanline width/softness
    //  as color increases.  The wider this is, the more scanlines need to be
    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
    //  Generalized Gaussians permit more leeway here as steepness increases.
    if(beam_spot_shape_function < 0.5)
    {
        //  Use a power function:
        return float3(beam_min_sigma) + sigma_range *
            pow(color, float3(beam_spot_power));
    }
    else
    {
        //  Use a spherical function:
        const float3 color_minus_1 = color - float3(1.0);
        return float3(beam_min_sigma) + sigma_range *
            sqrt(float3(1.0) - color_minus_1*color_minus_1);
    }
}

inline float3 get_generalized_gaussian_beta(const float3 color,
    const float shape_range)
{
    //  Requires:   Globals:
    //              1.) beam_min_shape and beam_max_shape are global floats
    //                  containing the desired min/max generalized Gaussian
    //                  beta parameters, for dim and bright colors respectively.
    //              2.) beam_max_shape must be >= 2.0
    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
    //              4.) beam_shape_power must be defined as a global float.
    //              Parameters:
    //              1.) color is the underlying source color along a scanline
    //              2.) shape_range = beam_max_shape - beam_min_shape; we take
    //                  shape_range as a parameter to avoid repeated computation
    //                  when beam_{min, max}_shape are runtime shader parameters
    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
    //              the given color.
    //  Details/Discussion:
    //  Beta affects the scanline distribution as follows:
    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
    //  b.) beta == 2.0 just degenerates to a Gaussian
    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
    //      beta widen and sharpen peaks at the risk of aliasing.
    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
    //  The scalar minimum is widened explicitly, matching get_gaussian_sigma().
    return float3(beam_min_shape) + shape_range *
        pow(color, float3(beam_shape_power));
}

float3 scanline_gaussian_integral_contrib(const float3 dist,
    const float3 color, const float pixel_height, const float sigma_range)
{
    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
    //                  point(s) from a scanline in units of scanlines, where
    //                  1.0 means the sample point straddles the next scanline.
    //              2.) color is the underlying source color along a scanline.
    //              3.) pixel_height is the output pixel height in scanlines.
    //              4.) Requirements of get_gaussian_sigma() must be met.
    //  Returns:    Return a scanline's light output over a given pixel.
    //  Details:
    //  The CRT beam profile follows a roughly Gaussian distribution which is
    //  wider for bright colors than dark ones.  The integral over the full
    //  range of a Gaussian function is always 1.0, so we can vary the beam
    //  with a standard deviation without affecting brightness.  'x' = distance:
    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
    //  Use a numerical approximation of the "error function" (the Gaussian
    //  indefinite integral) to find the definite integral of the scanline's
    //  average brightness over a given pixel area.  Even if curved coords were
    //  used in this pass, a flat scalar pixel height works almost as well as a
    //  pixel height computed from a full pixel-space to scanline-space matrix.
    const float3 sigma = get_gaussian_sigma(color, sigma_range);
    const float3 ph_offset = float3(pixel_height * 0.5);
    const float3 denom_inv = 1.0/(sigma*sqrt(2.0));
    const float3 integral_high = erf((dist + ph_offset)*denom_inv);
    const float3 integral_low = erf((dist - ph_offset)*denom_inv);
    return color * 0.5*(integral_high - integral_low)/pixel_height;
}

float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
    float3 color, float pixel_height, float sigma_range,
    float shape_range)
{
    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
    //                  must be met.
    //              2.) Requirements of get_gaussian_sigma() must be met.
    //              3.) Requirements of get_generalized_gaussian_beta() must be
    //                  met.
    //  Returns:    Return a scanline's light output over a given pixel.
    //  A generalized Gaussian distribution allows the shape (beta) to vary
    //  as well as the width (alpha).  "gamma" refers to the gamma function:
    //      generalized sample =
    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
    //  implement two of four branches (because we keep 1/beta <= 0.5):
    //      generalized integral = 0.5 + 0.5* sign(x) *
    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
    //  See get_generalized_gaussian_beta() for a discussion of beta.
    //  We base alpha on the intended Gaussian sigma, but it only strictly
    //  models standard deviation at beta == 2, because the standard
    //  deviation depends on both alpha and beta (keeping alpha independent is
    //  faster and preserves intuitive behavior and a full spectrum of results).
    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
    const float3 alpha_inv = float3(1.0)/alpha;
    const float3 s = float3(1.0)/beta;
    const float3 ph_offset = float3(pixel_height * 0.5);
    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
    const float3 dist1 = dist + ph_offset;
    const float3 dist0 = dist - ph_offset;
    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
    return color * 0.5*(integral_high - integral_low)/pixel_height;
}

float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
    const float pixel_height, const float sigma_range)
{
    //  See scanline_gaussian_integral_contrib() for detailed comments!
    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
    const float3 sigma = get_gaussian_sigma(color, sigma_range);
    //  Avoid repeated divides:
    const float3 sigma_inv = float3(1.0)/sigma;
    const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;
    const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);
    if(beam_antialias_level > 0.5)
    {
        //  Sample 1/3 pixel away in each direction as well:
        const float3 sample_offset = float3(pixel_height/3.0);
        const float3 dist2 = dist + sample_offset;
        const float3 dist3 = abs(dist - sample_offset);
        //  Average three pure Gaussian samples:
        const float3 scale = color/3.0 * outer_denom_inv;
        const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
        const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
        const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
        return scale * (weight1 + weight2 + weight3);
    }
    else
    {
        return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv;
    }
}

float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
    float3 color, float pixel_height, float sigma_range,
    float shape_range)
{
    //  See scanline_generalized_gaussian_integral_contrib() for details!
    //  generalized sample =
    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
    //  Avoid repeated divides:
    const float3 alpha_inv = float3(1.0)/alpha;
    const float3 beta_inv = float3(1.0)/beta;
    const float3 scale = color * beta * 0.5 * alpha_inv /
        gamma_impl(beta_inv, beta);
    if(beam_antialias_level > 0.5)
    {
        //  Sample 1/3 pixel closer to and farther from the scanline too.
        const float3 sample_offset = float3(pixel_height/3.0);
        const float3 dist2 = dist + sample_offset;
        const float3 dist3 = abs(dist - sample_offset);
        //  Average three generalized Gaussian samples:
        const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
        const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
        const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
        return scale/3.0 * (weight1 + weight2 + weight3);
    }
    else
    {
        return scale * exp(-pow(abs(dist*alpha_inv), beta));
    }
}

inline float3 scanline_contrib(float3 dist, float3 color,
    float pixel_height, const float sigma_range, const float shape_range)
{
    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
    //                  must be met.
    //              2.) Requirements of get_gaussian_sigma() must be met.
    //              3.) Requirements of get_generalized_gaussian_beta() must be
    //                  met.
    //  Returns:    Return a scanline's light output over a given pixel, using
    //              a generalized or pure Gaussian distribution and sampling or
    //              integrals as desired by user codepath choices.
    //  Dispatch:   beam_generalized_gaussian selects the distribution, and
    //              beam_antialias_level > 1.5 selects the integral variant
    //              over the sampled one.
    if(beam_generalized_gaussian)
    {
        if(beam_antialias_level > 1.5)
        {
            return scanline_generalized_gaussian_integral_contrib(
                dist, color, pixel_height, sigma_range, shape_range);
        }
        else
        {
            return scanline_generalized_gaussian_sampled_contrib(
                dist, color, pixel_height, sigma_range, shape_range);
        }
    }
    else
    {
        if(beam_antialias_level > 1.5)
        {
            return scanline_gaussian_integral_contrib(
                dist, color, pixel_height, sigma_range);
        }
        else
        {
            return scanline_gaussian_sampled_contrib(
                dist, color, pixel_height, sigma_range);
        }
    }
}

inline float3 get_raw_interpolated_color(const float3 color0,
    const float3 color1, const float3 color2, const float3 color3,
    const float4 weights)
{
    //  Returns:    The weighted sum of color0-3 (one weight per color),
    //              clamped below at 0.0.
    //  Use max to avoid bizarre artifacts from negative colors:
    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
}

float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
    const float3 color2, const float3 color3, const float4 weights)
{
    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
    //                  intermediate_gamma must be globally defined, and input
    //                  colors are interpreted as linear RGB unless you #define
    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
    //                  interpreted as gamma-encoded with intermediate_gamma).
    //              2.) color0-3 are colors sampled from a texture with tex2D().
    //                  They are interpreted as defined in requirement 1.
    //              3.) weights contains weights for each color, summing to 1.0.
    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
    //                  float in [0.0, 1.0] describing how much blending should
    //                  be done in linear RGB (rest is gamma-corrected RGB).
    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
    //                  if beam_horiz_linear_rgb_weight is anything other than a
    //                  static constant, or we may try branching at runtime
    //                  without dynamic branches allowed (slow).
    //  Returns:    Return an interpolated color lookup between the four input
    //              colors based on the weights in weights.  The final color will
    //              be a linear RGB value, but the blending will be done as
    //              indicated above.
    const float intermediate_gamma = get_intermediate_gamma();
    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
    //  profile allows dynamic branches (faster than computing extra pows):
    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
    #else
        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
        #endif
    #endif
    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
        //  beam_horiz_linear_rgb_weight is static, so we can branch:
        #ifdef GAMMA_ENCODE_EVERY_FBO
            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
                color0, color1, color2, color3, weights), float3(intermediate_gamma));
            if(beam_horiz_linear_rgb_weight > 0.0)
            {
                const float3 linear_mixed_color = get_raw_interpolated_color(
                    pow(color0, float3(intermediate_gamma)),
                    pow(color1, float3(intermediate_gamma)),
                    pow(color2, float3(intermediate_gamma)),
                    pow(color3, float3(intermediate_gamma)),
                    weights);
                return lerp(gamma_mixed_color, linear_mixed_color,
                    beam_horiz_linear_rgb_weight);
            }
            else
            {
                return gamma_mixed_color;
            }
        #else
            const float3 linear_mixed_color = get_raw_interpolated_color(
                color0, color1, color2, color3, weights);
            if(beam_horiz_linear_rgb_weight < 1.0)
            {
                const float3 gamma_mixed_color = get_raw_interpolated_color(
                    pow(color0, float3(1.0/intermediate_gamma)),
                    pow(color1, float3(1.0/intermediate_gamma)),
                    pow(color2, float3(1.0/intermediate_gamma)),
                    pow(color3, float3(1.0/intermediate_gamma)),
                    weights);
                return lerp(gamma_mixed_color, linear_mixed_color,
                    beam_horiz_linear_rgb_weight);
            }
            else
            {
                return linear_mixed_color;
            }
        #endif  //  GAMMA_ENCODE_EVERY_FBO
    #else
        #ifdef GAMMA_ENCODE_EVERY_FBO
            //  Inputs: color0-3 are colors in gamma-encoded RGB.
            //  The exponent is widened to float3 explicitly: GLSL's pow()
            //  needs both arguments to have the same type, as the branching
            //  path above already does.
            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
                color0, color1, color2, color3, weights), float3(intermediate_gamma));
            const float3 linear_mixed_color = get_raw_interpolated_color(
                pow(color0, float3(intermediate_gamma)),
                pow(color1, float3(intermediate_gamma)),
                pow(color2, float3(intermediate_gamma)),
                pow(color3, float3(intermediate_gamma)),
                weights);
            return lerp(gamma_mixed_color, linear_mixed_color,
                beam_horiz_linear_rgb_weight);
        #else
            //  Inputs: color0-3 are colors in linear RGB.
            const float3 linear_mixed_color = get_raw_interpolated_color(
                color0, color1, color2, color3, weights);
            const float3 gamma_mixed_color = get_raw_interpolated_color(
                pow(color0, float3(1.0/intermediate_gamma)),
                pow(color1, float3(1.0/intermediate_gamma)),
                pow(color2, float3(1.0/intermediate_gamma)),
                pow(color3, float3(1.0/intermediate_gamma)),
                weights);
            //  FIXME(review): The port left a disabled hard-coded override
            //  here; its purpose is not evident from this file:
            //      const float beam_horiz_linear_rgb_weight1 = 1.0;
            return lerp(gamma_mixed_color, linear_mixed_color,
                beam_horiz_linear_rgb_weight);
        #endif  //  GAMMA_ENCODE_EVERY_FBO
    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
}

float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
    const float2 uv_step_x, const float4 weights)
{
    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
    //                  desired line or scanline and horizontally snapped to the
    //                  texel just left of the output pixel (color1)
    //              2.) uv_step_x must contain the horizontal uv distance
    //                  between texels.
    //              3.) weights must contain interpolation filter weights for
    //                  color0, color1, color2, and color3, where color1 is just
    //                  left of the output pixel.
    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
    //              nearby texels, according to weights and the conventions of
    //              get_interpolated_linear_color().
    //  We can ignore the outside texture lookups for Quilez resampling.
    const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb;
    const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
    float3 color0 = float3(0.0);
    float3 color3 = float3(0.0);
    if(beam_horiz_filter > 0.5)
    {
        color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
        color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
    }
    //  Sample the texture as-is, whether it's linear or gamma-encoded:
    //  get_interpolated_linear_color() will handle the difference.
    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
}

float3 sample_single_scanline_horizontal(const sampler2D tex,
    const float2 tex_uv, const float2 tex_size,
    const float2 texture_size_inv)
{
    //  Parameters: 1.) tex is the texture holding the scanline to sample.
    //              2.) tex_uv is the sample position in uv coordinates.
    //              3.) tex_size converts uv coordinates to texel coordinates
    //                  (tex_uv * tex_size).
    //              4.) texture_size_inv converts texel coordinates back to uv
    //                  (presumably 1.0/tex_size; confirm against the caller).
    //  Returns:    The horizontally filtered color at tex_uv, with the filter
    //              chosen by the global beam_horiz_filter:
    //              < 0.5 Quilez, < 1.5 Gaussian, otherwise Lanczos2.
    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
    const float2 curr_texel = tex_uv * tex_size;
    //  Use under_half to fix a rounding bug right around exact texel locations.
    const float2 prev_texel =
        floor(curr_texel - float2(under_half)) + float2(0.5);
    const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);
    const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
    const float prev_dist = curr_texel.x - prev_texel_hor.x;
    const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
        1.0 - prev_dist, 2.0 - prev_dist);
    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
    float4 weights;
    if(beam_horiz_filter < 0.5)
    {
        //  Quilez:
        const float x = sample_dists.y;
        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
        weights = float4(0.0, 1.0 - w2, w2, 0.0);
    }
    else if(beam_horiz_filter < 1.5)
    {
        //  Gaussian:
        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
    }
    else
    {
        //  Lanczos2:
        const float4 pi_dists = FIX_ZERO(sample_dists * pi);
        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
            (pi_dists * pi_dists);
    }
    //  Ensure the weight sum == 1.0:
    const float4 final_weights = weights/dot(weights, float4(1.0));
    //  Get the interpolated horizontal scanline color:
    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
    return get_scanline_color(
        tex, prev_texel_hor_uv, uv_step_x, final_weights);
}

float3 sample_rgb_scanline_horizontal(const sampler2D tex,
    const float2 tex_uv, const float2 tex_size,
    const float2 texture_size_inv)
{
    //  Parameters: Same as sample_single_scanline_horizontal().
    //  Returns:    The horizontally filtered scanline color at tex_uv.  If the
    //              global beam_misconvergence is set, R, G, and B are sampled
    //              separately at horizontally shifted positions taken from
    //              get_convergence_offsets_x_vector(), scaled by
    //              texture_size_inv.x.
    //  Rely on a helper to make convergence easier.
    if(beam_misconvergence)
    {
        const float3 convergence_offsets_rgb =
            get_convergence_offsets_x_vector();
        const float3 offset_u_rgb =
            convergence_offsets_rgb * texture_size_inv.xxx;
        const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0);
        const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0);
        const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0);
        const float3 sample_r = sample_single_scanline_horizontal(
            tex, scanline_uv_r, tex_size, texture_size_inv);
        const float3 sample_g = sample_single_scanline_horizontal(
            tex, scanline_uv_g, tex_size, texture_size_inv);
        const float3 sample_b = sample_single_scanline_horizontal(
            tex, scanline_uv_b, tex_size, texture_size_inv);
        return float3(sample_r.r, sample_g.g, sample_b.b);
    }
    else
    {
        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
            texture_size_inv);
    }
}

float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
    const float2 texture_size_inv, const float2 il_step_multiple,
    const float frame_count, out float dist)
{
    //  Compute texture coords for the last/upper scanline, accounting for
    //  interlacing: With interlacing, only consider even/odd scanlines every
    //  other frame.  Top-field first (TFF) order puts even scanlines on even
    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
    //      frac(tex_uv * tex_size) == x.5
    //  Caution: If these coordinates ever seem incorrect, first make sure it's
    //  not because anisotropic filtering is blurring across field boundaries.
    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
    //  Output: dist receives the sample's distance from the returned scanline,
    //  in units of scanlines.
    //  FIXME(review): The port left a disabled hard-coded override here; its
    //  purpose is not evident from this file:
    //      const float interlace_bff1 = 1.0;
    const float field_offset = floor(il_step_multiple.y * 0.75) *
        fmod(frame_count + float(interlace_bff), 2.0);
    const float2 curr_texel = tex_uv * tex_size;
    //  Use under_half to fix a rounding bug right around exact texel locations.
    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
    const float wrong_field = fmod(
        prev_texel_num.y + field_offset, il_step_multiple.y);
    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
    //  Snap to the center of the previous scanline in the current field:
    const float2 scanline_texel = scanline_texel_num + float2(0.5);
    const float2 scanline_uv = scanline_texel * texture_size_inv;
    //  Save the sample's distance from the scanline, in units of scanlines:
    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
    return scanline_uv;
}

inline bool is_interlaced(float num_lines)
{
    //  Detect interlacing based on the number of lines in the source.
    //  Always returns false when the global interlace_detect is off.
    if(interlace_detect)
    {
        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
        //  NTSC Emulators: Typically 224 or 240 lines
        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
        //  PAL Emulators: ?
        //  ATSC: 720p, 1080i, 1080p
        //  Where do we place our cutoffs?  Assumptions:
        //  1.) We only need to care about active lines.
        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
        //  3.) Anything > 576 lines is probably not interlaced...
        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
        //  5.) Just in case the main program uses calculated video sizes,
        //      we should nudge the float thresholds a bit.
        const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5));
        const bool hd_interlace = bool(interlace_1080i) ?
            ((num_lines > 1079.5) && (num_lines < 1080.5)) :
            false;
        return (sd_interlace || hd_interlace);
    }
    else
    {
        return false;
    }
}

#endif // SCANLINE_FUNCTIONS_H

///////////////////////////// END SCANLINE-FUNCTIONS ////////////////////////////

//#include "../../../../include/gamma-management.h"

//////////////////////////// BEGIN GAMMA-MANAGEMENT //////////////////////////

#ifndef GAMMA_MANAGEMENT_H
#define GAMMA_MANAGEMENT_H

///////////////////////////////// MIT LICENSE ////////////////////////////////

// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

///////////////////////////////// DESCRIPTION ////////////////////////////////

// This file provides gamma-aware tex*D*() and encode_output() functions.
+// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) 
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. 
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them (this whole section is skipped when OVERRIDE_STANDARD_GAMMA is defined): +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55); returned by get_crt_gamma() unless OVERRIDE_DEVICE_GAMMA is defined + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB; returned by get_lcd_gamma() unless OVERRIDE_DEVICE_GAMMA is defined +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. 
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT 
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct (true exactly when linearize_input is false): +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING 
FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) // Raises rgb to 1.0/get_pass_output_gamma() when gamma_encode_output is set, otherwise returns color unchanged; alpha is forced to 1.0 if assume_opaque_alpha. +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_input(const float4 color) // Raises rgb to get_pass_input_gamma() when linearize_input is set, otherwise returns color unchanged; alpha is forced to 1.0 if assume_opaque_alpha. +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +inline float4 decode_gamma_input(const float4 color, const float3 gamma) // Always raises rgb to the caller-supplied per-channel gamma (linearize_input is not consulted); alpha is forced to 1.0 if assume_opaque_alpha. +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: sample tex and pass the result through decode_input(). NOTE(review): the two texel_off overloads forward texel_off as the third (LOD) argument of textureLod rather than as a texel offset -- confirm this is intended. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: NOTE(review): only tex_coords.xy is used; the first overload samples at a fixed LOD of 0.0 and the second passes texel_off as the LOD, so tex_coords.w is ignored -- confirm against callers. +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: NOTE(review): the block-comment terminator on the previous line is the first one after the comment opened before the tex3D wrappers (the slash-run banner ending in an asterisk that follows them does not terminate it), so the tex2D_linearize_gamma overloads above appear to be compiled out -- confirm that is intended. Only tex_coords.xy is used below: the first overload samples at a fixed LOD of 0.0 and the second passes texel_off as the LOD. +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + vec2 tex_uv = vTexCoord.xy; + // This pass: Sample multiple (misconverged?) scanlines to the final + // vertical resolution. 
Temporarily auto-dim the output to avoid clipping. + + // Read some attributes into local variables: + float2 texture_size_ = texture_size; + float2 texture_size_inv = 1.0/texture_size_; + //const float2 uv_step = uv_step; + //const float2 il_step_multiple = il_step_multiple; + float frame_count = float(frame_count); + const float ph = pixel_height_in_scanlines; + + // Get the uv coords of the previous scanline (in this field), and the + // scanline's distance from this sample, in scanlines. + float dist; + const float2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size_, + texture_size_inv, il_step_multiple, frame_count, dist); + // Consider 2, 3, 4, 5, or 6 scanlines numbered 0-5: The previous and next + // scanlines are numbered 2 and 3. Get scanline colors (ignore + // horizontal sampling, since output_size.x = video_size.x). + // NOTE: Anisotropic filtering creates interlacing artifacts, which is why + // ORIG_LINEARIZED bobbed any interlaced input before this pass. + const float2 v_step = float2(0.0, uv_step.y); + const float3 scanline2_color = tex2D_linearize(input_texture, scanline_uv).rgb; + const float3 scanline3_color = + tex2D_linearize(input_texture, scanline_uv + v_step).rgb; + float3 scanline0_color, scanline1_color, scanline4_color, scanline5_color, + scanline_outside_color; + float dist_round; + // Use scanlines 0, 1, 4, and 5 for a total of 6 scanlines: + if(beam_num_scanlines > 5.5) + { + scanline1_color = + tex2D_linearize(input_texture, scanline_uv - v_step).rgb; + scanline4_color = + tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb; + scanline0_color = + tex2D_linearize(input_texture, scanline_uv - 2.0 * v_step).rgb; + scanline5_color = + tex2D_linearize(input_texture, scanline_uv + 3.0 * v_step).rgb; + } + // Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines: + else if(beam_num_scanlines > 4.5) + { + scanline1_color = + tex2D_linearize(input_texture, scanline_uv - v_step).rgb; + scanline4_color = + 
tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb; + // dist is in [0, 1] + dist_round = round(dist); + const float2 sample_0_or_5_uv_off = + lerp(-2.0 * v_step, 3.0 * v_step, dist_round); + // Call this "scanline_outside_color" to cope with the conditional + // scanline number: + scanline_outside_color = tex2D_linearize( + input_texture, scanline_uv + sample_0_or_5_uv_off).rgb; + } + // Use scanlines 1 and 4 for a total of 4 scanlines: + else if(beam_num_scanlines > 3.5) + { + scanline1_color = + tex2D_linearize(input_texture, scanline_uv - v_step).rgb; + scanline4_color = + tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb; + } + // Use scanline 1 or 4 for a total of 3 scanlines: + else if(beam_num_scanlines > 2.5) + { + // dist is in [0, 1] + dist_round = round(dist); + const float2 sample_1or4_uv_off = + lerp(-v_step, 2.0 * v_step, dist_round); + scanline_outside_color = tex2D_linearize( + input_texture, scanline_uv + sample_1or4_uv_off).rgb; + } + + // Compute scanline contributions, accounting for vertical convergence. + // Vertical convergence offsets are in units of current-field scanlines. + // dist2 means "positive sample distance from scanline 2, in scanlines:" + float3 dist2 = float3(dist); + if(beam_misconvergence) + { + const float3 convergence_offsets_vert_rgb = + get_convergence_offsets_y_vector(); + dist2 = float3(dist) - convergence_offsets_vert_rgb; + } + // Calculate {sigma, shape}_range outside of scanline_contrib so it's only + // done once per pixel (not 6 times) with runtime params. Don't reuse the + // vertex shader calculations, so static versions can be constant-folded. + const float sigma_range = max(beam_max_sigma, beam_min_sigma) - + beam_min_sigma; + const float shape_range = max(beam_max_shape, beam_min_shape) - + beam_min_shape; + // Calculate and sum final scanline contributions, starting with lines 2/3. + // There is no normalization step, because we're not interpolating a + // continuous signal. 
Instead, each scanline is an additive light source. + const float3 scanline2_contrib = scanline_contrib(dist2, + scanline2_color, ph, sigma_range, shape_range); + const float3 scanline3_contrib = scanline_contrib(abs(float3(1.0,1.0,1.0) - dist2), + scanline3_color, ph, sigma_range, shape_range); + float3 scanline_intensity = scanline2_contrib + scanline3_contrib; + if(beam_num_scanlines > 5.5) + { + const float3 scanline0_contrib = + scanline_contrib(dist2 + float3(2.0,2.0,2.0), scanline0_color, + ph, sigma_range, shape_range); + const float3 scanline1_contrib = + scanline_contrib(dist2 + float3(1.0,1.0,1.0), scanline1_color, + ph, sigma_range, shape_range); + const float3 scanline4_contrib = + scanline_contrib(abs(float3(2.0,2.0,2.0) - dist2), scanline4_color, + ph, sigma_range, shape_range); + const float3 scanline5_contrib = + scanline_contrib(abs(float3(3.0) - dist2), scanline5_color, + ph, sigma_range, shape_range); + scanline_intensity += scanline0_contrib + scanline1_contrib + + scanline4_contrib + scanline5_contrib; + } + else if(beam_num_scanlines > 4.5) + { + const float3 scanline1_contrib = + scanline_contrib(dist2 + float3(1.0,1.0,1.0), scanline1_color, + ph, sigma_range, shape_range); + const float3 scanline4_contrib = + scanline_contrib(abs(float3(2.0,2.0,2.0) - dist2), scanline4_color, + ph, sigma_range, shape_range); + const float3 dist0or5 = lerp( + dist2 + float3(2.0,2.0,2.0), float3(3.0,3.0,3.0) - dist2, dist_round); + const float3 scanline0or5_contrib = scanline_contrib( + dist0or5, scanline_outside_color, ph, sigma_range, shape_range); + scanline_intensity += scanline1_contrib + scanline4_contrib + + scanline0or5_contrib; + } + else if(beam_num_scanlines > 3.5) + { + const float3 scanline1_contrib = + scanline_contrib(dist2 + float3(1.0,1.0,1.0), scanline1_color, + ph, sigma_range, shape_range); + const float3 scanline4_contrib = + scanline_contrib(abs(float3(2.0,2.0,2.0) - dist2), scanline4_color, + ph, sigma_range, shape_range); + 
scanline_intensity += scanline1_contrib + scanline4_contrib; + } + else if(beam_num_scanlines > 2.5) + { + const float3 dist1or4 = lerp( + dist2 + float3(1.0,1.0,1.0), float3(2.0,2.0,2.0) - dist2, dist_round); + const float3 scanline1or4_contrib = scanline_contrib( + dist1or4, scanline_outside_color, ph, sigma_range, shape_range); + scanline_intensity += scanline1or4_contrib; + } + + // Auto-dim the image to avoid clipping, encode if necessary, and output. + // My original idea was to compute a minimal auto-dim factor and put it in + // the alpha channel, but it wasn't working, at least not reliably. This + // is faster anyway, levels_autodim_temp = 0.5 isn't causing banding. + FragColor = encode_output(float4(scanline_intensity * levels_autodim_temp, 1.0)); +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs new file mode 100644 index 00000000..8fe7b14c --- /dev/null +++ b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs @@ -0,0 +1,5830 @@ +#version 150 + +in vec4 position; +in vec2 texCoord; + +out Vertex { + vec2 vTexCoord; + vec2 uv_step; + vec2 il_step_multiple; + float pixel_height_in_scanlines; +}; + +uniform vec4 targetSize; +uniform vec4 sourceSize[]; + +// USER SETTINGS BLOCK // + +#define crt_gamma 2.500000 +#define lcd_gamma 2.200000 +#define levels_contrast 1.0 +#define halation_weight 0.0 +#define diffusion_weight 0.075 +#define bloom_underestimate_levels 0.8 +#define bloom_excess 0.000000 +#define beam_min_sigma 0.020000 +#define beam_max_sigma 0.300000 +#define beam_spot_power 0.330000 +#define beam_min_shape 2.000000 +#define beam_max_shape 4.000000 +#define beam_shape_power 0.250000 +#define beam_horiz_filter 0.000000 +#define beam_horiz_sigma 0.35 +#define beam_horiz_linear_rgb_weight 1.000000 +#define convergence_offset_x_r -0.000000 +#define convergence_offset_x_g 0.000000 +#define convergence_offset_x_b 0.000000 +#define 
convergence_offset_y_r 0.000000 +#define convergence_offset_y_g -0.000000 +#define convergence_offset_y_b 0.000000 +#define mask_type 1.000000 +#define mask_sample_mode_desired 0.000000 +#define mask_specify_num_triads 0.000000 +#define mask_triad_size_desired 3.000000 +#define mask_num_triads_desired 480.000000 +#define aa_subpixel_r_offset_x_runtime -0.0 +#define aa_subpixel_r_offset_y_runtime 0.000000 +#define aa_cubic_c 0.500000 +#define aa_gauss_sigma 0.500000 +#define geom_mode_runtime 2.000000 +#define geom_radius 2.000000 +#define geom_view_dist 2.000000 +#define geom_tilt_angle_x 0.000000 +#define geom_tilt_angle_y 0.000000 +#define geom_aspect_ratio_x 432.000000 +#define geom_aspect_ratio_y 329.000000 +#define geom_overscan_x 1.000000 +#define geom_overscan_y 1.000000 +#define border_size 0.015 +#define border_darkness 2.0 +#define border_compress 2.500000 +#define interlace_bff 0.000000 +#define interlace_1080i 0.000000 + +// END USER SETTINGS BLOCK // + +// compatibility macros for transparently converting HLSLisms into GLSLisms +#define mul(a,b) (b*a) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size sourceSize[0].xy +#define video_size sourceSize[0].xy +#define output_size targetSize.xy +#define frame_count phase +#define static +#define inline +#define const +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(x,y) atan(y,x) +#define rsqrt(c) inversesqrt(c) + +#define input_texture source[0] + +#if defined(GL_ES) + #define COMPAT_PRECISION mediump +#else + #define COMPAT_PRECISION +#endif + +#if __VERSION__ >= 130 + #define COMPAT_TEXTURE texture +#else + #define COMPAT_TEXTURE 
texture2D +#endif + +////////////////////////////////// INCLUDES ////////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. 
Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) 
The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. 
+ #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now).
+ static const float levels_autodim_temp = 0.5; // range (0, 1]; default and current value is 0.5 (raise toward 1.0 if the image looks unnecessarily dark) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel?
Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now).
+ static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines.
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (one author found that unnecessarily dark and preferred 1.0, but this file keeps 0.5) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, now just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). 
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "bind-shader-params.h" + +///////////////////////////// BEGIN BIND-SHADER-PARAMS //////////////////////////// + +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note here called 0.5 unnecessarily dark and mentioned 1.0, but 0.5 is the value actually set) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "derived-settings-and-constants.h" + +///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. 
Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. 
Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. 
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are 
enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5 (an earlier note here called 0.5 unnecessarily dark and mentioned 1.0, but 0.5 is the value actually set) + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? 
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. 
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. 
+#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. 
(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +//////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ///////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Provide accessors for vector constants that pack scalar uniforms: +inline float2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +inline float2 get_geom_overscan_vector() +{ + return float2(geom_overscan_x, geom_overscan_y); +} + +inline float2 get_geom_tilt_angle_vector() +{ + return float2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +inline float3 get_convergence_offsets_x_vector() +{ + return float3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +inline float3 get_convergence_offsets_y_vector() +{ + return float3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +inline float2 get_convergence_offsets_r_vector() +{ + return float2(convergence_offset_x_r, convergence_offset_y_r); +} + +inline float2 get_convergence_offsets_g_vector() +{ + return float2(convergence_offset_x_g, convergence_offset_y_g); +} + +inline float2 get_convergence_offsets_b_vector() +{ + return float2(convergence_offset_x_b, convergence_offset_y_b); +} + +inline float2 get_aa_subpixel_r_offset() 
+{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY EXPENSIVE. + return float2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking": +inline float get_mask_amplify() // gain = 1/(average mask color), picked by mask_type +{ + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +inline float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); // without manual resize, modes below 1.0 are raised to 1.0 + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); // without manual resize, modes below 1.0 are raised to 1.0 + #endif + #endif +} + +#endif // BIND_SHADER_PARAMS_H + +//////////////////////////// END BIND-SHADER-PARAMS /////////////////////////// + +//#include "scanline-functions.h" + +///////////////////////////// BEGIN SCANLINE-FUNCTIONS //////////////////////////// + +#ifndef SCANLINE_FUNCTIONS_H +#define SCANLINE_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. 
+// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. 
+// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. 
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants.
+ +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an earlier comment said this was raised to 1.0 because 0.5 looked unnecessarily dark, but the value set here is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? + static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.)
3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now).
+ static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally?
+ // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? 
Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. 
+ // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. + // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size.
Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. 
To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +//////////////////////////// END USER-SETTINGS ////////////////////////// + +//#include "derived-settings-and-constants.h" + +//////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS //////////////////// + +#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H +#define DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// These macros and constants can be used across the whole codebase. +// Unlike the values in user-settings.cgh, end users shouldn't modify these. + + +/////////////////////////////// BEGIN INCLUDES /////////////////////////////// + +//#include "../user-settings.h" + +///////////////////////////// BEGIN USER-SETTINGS //////////////////////////// + +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. 
+// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE: an earlier comment said this was raised to 1.0 because 0.5 looked unnecessarily dark, but the value set here is 0.5 + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + +///////////////////////////// END USER-SETTINGS //////////////////////////// + +//#include "user-cgp-constants.h" + +///////////////////////// BEGIN USER-CGP-CONSTANTS ///////////////////////// + +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. 
There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +static const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). 
+//#define PHOSPHOR_MASK_GRILLE14 +static const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +static const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +static const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +static const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + static const float mask_grille_avg_color = mask_grille14_avg_color; +#else + static const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + +////////////////////////// END USER-CGP-CONSTANTS ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) 
"Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + static const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 
0.0 : bloom_approx_filter_static; +#else + static const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + static const float2 mask_resize_src_lut_size = mask_texture_large_size; + #else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + static const float2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc.
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + static const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + static const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + static const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + static const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + static const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: +static const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; + #else + static const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; + #endif +#else + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): +static const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + +static const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + +///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS //////////////////////////// + +//#include "../../../../include/special-functions.h" + +/////////////////////////// BEGIN SPECIAL-FUNCTIONS ////////////////////////// + +#ifndef SPECIAL_FUNCTIONS_H +#define SPECIAL_FUNCTIONS_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file implements the following mathematical special functions: +// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2)) +// 2.) 
gamma(s), a real-numbered extension of the integer factorial function +// It also implements normalized_ligamma(s, z), a normalized lower incomplete +// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can +// be called with an _impl suffix to use an implementation version with a few +// extra precomputed parameters (which may be useful for the caller to reuse). +// See below for details. +// +// Design Rationale: +// Pretty much every line of code in this file is duplicated four times for +// different input types (float4/float3/float2/float). This is unfortunate, +// but Cg doesn't allow function templates. Macros would be far less verbose, +// but they would make the code harder to document and read. I don't expect +// these functions will require a whole lot of maintenance changes unless +// someone ever has need for more robust incomplete gamma functions, so code +// duplication seems to be the lesser evil in this case. + + +/////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// + +float4 erf6(float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Return an Abramowitz/Stegun approximation of erf(), where: + // erf(x) = 2/sqrt(pi) * integral(e**(-x**2)) + // This approximation has a max absolute error of 2.5*10**-5 + // with solid numerical robustness and efficiency. 
See: + // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float3 erf6(const float3 x) +{ + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float2 erf6(const float2 x) +{ + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float erf6(const float x) +{ + // Float version: + const float sign_x = sign(x); + const float t = 1.0/(1.0 + 0.47047*abs(x)); + const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + exp(-(x*x)); + return result * sign_x; +} + +float4 erft(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Approximate erf() with the hyperbolic tangent. The error is + // visually noticeable, but it's blazing fast and perceptually + // close...at least on ATI hardware. See: + // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html + // Warning: Only use this if your hardware drivers correctly implement + // tanh(): My nVidia 8800GTS returns garbage output. 
+ return tanh(1.202760580 * x); +} + +float3 erft(const float3 x) +{ + // Float3 version: + return tanh(1.202760580 * x); +} + +float2 erft(const float2 x) +{ + // Float2 version: + return tanh(1.202760580 * x); +} + +float erft(const float x) +{ + // Float version: + return tanh(1.202760580 * x); +} + +inline float4 erf(const float4 x) +{ + // Requires: x is the standard parameter to erf(). + // Returns: Some approximation of erf(x), depending on user settings. + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float3 erf(const float3 x) +{ + // Float3 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float2 erf(const float2 x) +{ + // Float2 version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + +inline float erf(const float x) +{ + // Float version: + #ifdef ERF_FAST_APPROXIMATION + return erft(x); + #else + return erf6(x); + #endif +} + + +/////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// + +float4 gamma_impl(const float4 s, const float4 s_inv) +{ + // Requires: 1.) s is the standard parameter to the gamma function, and + // it should lie in the [0, 36] range. + // 2.) s_inv = 1.0/s. This implementation function requires + // the caller to precompute this value, giving users the + // opportunity to reuse it. + // Returns: Return approximate gamma function (real-numbered factorial) + // output using the Lanczos approximation with two coefficients + // calculated using Paul Godfrey's method here: + // http://my.fit.edu/~gabdo/gamma.txt + // An optimal g value for s in [0, 36] is ~1.12906830989, with + // a maximum relative error of 0.000463 for 2**16 equally + // spaced evals. We could use three coeffs (0.0000346 error) without + // hurting latency, but this allows more parallelism with + // outside instructions.
+ static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e + // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). + // This has less error for small s's than (s -= 1.0) at the beginning. + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float3 gamma_impl(const float3 s, const float3 s_inv) +{ + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float2 gamma_impl(const float2 s, const float2 s_inv) +{ + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float gamma_impl(const float s, const float s_inv) +{ + // Float version: + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 
0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; + const float sph = s + 0.5; + const float lanczos_sum = c0 + c1/(s + 1.0); + const float base = (sph + g)/e; + return (pow(base, sph) * lanczos_sum) * s_inv; +} + +float4 gamma(const float4 s) +{ + // Requires: s is the standard parameter to the gamma function, and it + // should lie in the [0, 36] range. + // Returns: Return approximate gamma function output with a maximum + // relative error of 0.000463. See gamma_impl for details. + return gamma_impl(s, float4(1.0)/s); +} + +float3 gamma(const float3 s) +{ + // Float3 version: + return gamma_impl(s, float3(1.0)/s); +} + +float2 gamma(const float2 s) +{ + // Float2 version: + return gamma_impl(s, float2(1.0)/s); +} + +float gamma(const float s) +{ + // Float version: + return gamma_impl(s, 1.0/s); +} + + +//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// + +// Lower incomplete gamma function for small s and z (implementation): +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) z <= ~0.775075 + // 3.) s_inv = 1.0/s (precomputed for outside reuse) + // Returns: A series representation for the lower incomplete gamma + // function for small s and small z (4 terms). 
+ // The actual "rolled up" summation looks like: + // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0; + // sum = last_sign * last_pow / ((s + k) * last_factorial) + // for(int i = 0; i < 4; ++i) + // { + // last_sign *= -1.0; last_pow *= z; last_factorial *= i; + // sum += last_sign * last_pow / ((s + k) * last_factorial); + // } + // Unrolled, constant-unfolded and arranged for madds and parallelism: + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result + // Summation iterations 1, 2, and 3: + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + //sum += z_sq * z_sq / denom4; + // Scale and return: + return scale * sum; +} + +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) +{ + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) +{ + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +float ligamma_small_z_impl(const float s, const float z, const float s_inv) +{ + // Float version: + const float scale = pow(z, s); + float sum = s_inv; + const float z_sq = z*z; + const float denom1 = s + 1.0; + const float denom2 = 2.0*s + 4.0; + const float denom3 = 6.0*s + 
18.0; + sum -= z/denom1; + sum += z_sq/denom2; + sum -= z * z_sq/denom3; + return scale * sum; +} + +// Upper incomplete gamma function for small s and large z (implementation): +float4 uigamma_large_z_impl(const float4 s, const float4 z) +{ + // Requires: 1.) s < ~0.5 + // 2.) z > ~0.775075 + // Returns: Gauss's continued fraction representation for the upper + // incomplete gamma function (4 terms). + // The "rolled up" continued fraction looks like this. The denominator + // is truncated, and it's calculated "from the bottom up:" + // denom = float4('inf'); + // float4 one = float4(1.0); + // for(int i = 4; i > 0; --i) + // { + // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; + // } + // Unrolled and constant-unfolded for madds and parallelism: + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; + return numerator / denom; +} + +float3 uigamma_large_z_impl(const float3 s, const float3 z) +{ + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; + return numerator / denom; +} + +float2 uigamma_large_z_impl(const float2 s, const float2 z) +{ + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; + return numerator / denom; +} + +float uigamma_large_z_impl(const float s, const float z) +{ + // Float version: + const float numerator = pow(z, s) * exp(-z); + float denom = 7.0 + z - s; + denom = 
5.0 + z - s + (3.0*s - 9.0)/denom; + denom = 3.0 + z - s + (2.0*s - 4.0)/denom; + denom = 1.0 + z - s + (s - 1.0)/denom; + return numerator / denom; +} + +// Normalized lower incomplete gamma function for small s (implementation): +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) +{ + // Requires: 1.) s < ~0.5 + // 2.) s_inv = 1/s (precomputed for outside reuse) + // 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse) + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. Since we only care about s < 0.5, we only need + // to evaluate two branches (not four) based on z. Each branch + // uses four terms, with a max relative error of ~0.00182. The + // branch threshold and specifics were adapted for fewer terms + // from Gil/Segura/Temme's paper here: + // http://oai.cwi.nl/oai/asset/20433/20433B.pdf + // Evaluate both branches: Real branches test slower even when available. + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + // Combine the results from both branches: + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); +} + +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) +{ + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 
inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); +} + +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) +{ + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); +} + +float normalized_ligamma_impl(const float s, const float z, + const float s_inv, const float gamma_s_inv) +{ + // Float version: + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; + const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + return large_z * float(z_is_large) + small_z * float(!z_is_large); +} + +// Normalized lower incomplete gamma function for small s: +float4 normalized_ligamma(const float4 s, const float4 z) +{ + // Requires: s < ~0.5 + // Returns: Approximate the normalized lower incomplete gamma function + // for s < 0.5. See normalized_ligamma_impl() for details. 
+ const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float3 normalized_ligamma(const float3 s, const float3 z) +{ + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float2 normalized_ligamma(const float2 s, const float2 z) +{ + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +float normalized_ligamma(const float s, const float z) +{ + // Float version: + const float s_inv = 1.0/s; + const float gamma_s_inv = 1.0/gamma_impl(s, s_inv); + return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); +} + +#endif // SPECIAL_FUNCTIONS_H + +//////////////////////////// END SPECIAL-FUNCTIONS /////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) 
OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). 
+// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. +// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. 
To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. +// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) 
+ static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. When this is true, encode_output() and +// decode_input() write alpha = 1.0 on the paths where they apply pow(), and +// decode_gamma_input() always does, instead of passing color.a through. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
+ +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma 
between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + // The SIMULATE_* macros are tested in the order below; only the first one + // that is #defined selects the input/output gammas: + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
+#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct +// (this is true exactly when the current pass does not linearize its input +// in the shader, i.e. when linearize_input is false): +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +// Returns: If gamma_encode_output is true, color with rgb raised per channel +// to 1.0/get_pass_output_gamma(), and alpha written as 1.0 when +// assume_opaque_alpha is set (color.a otherwise). If +// gamma_encode_output is false, color is returned unchanged, +// including its alpha. 
+inline float4 encode_output(const float4 color) +{ + if(gamma_encode_output) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); + } + } + else + { + return color; + } +} + +// Returns: If linearize_input is true, color with rgb raised per channel to +// get_pass_input_gamma(), and alpha written as 1.0 when +// assume_opaque_alpha is set (color.a otherwise). If +// linearize_input is false, color is returned unchanged. +inline float4 decode_input(const float4 color) +{ + if(linearize_input) + { + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); + } + else + { + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); + } + } + else + { + return color; + } +} + +// Returns: color with rgb raised to the caller-supplied per-channel gamma. +// Unlike decode_input(), this does not consult linearize_input: +// pow() is always applied. Alpha follows assume_opaque_alpha. +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return 
float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it +// (the active tex2D*_linearize wrappers below therefore take their +// tex_coords/texel_off parameters without 'const') + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// NOTE: In this port only the plain tex2D and tex2Dlod wrappers are active; +// the 1D/3D, bias, fetch, proj and derivative variants are commented out. +// Standard line length restrictions are ignored below for vertical brevity. 
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: sample tex and pass the result through decode_input(). The float3 +// overloads read only tex_coords.xy. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +// NOTE(review): in the two overloads below, texel_off is forwarded as the +// third argument of textureLod(), which GLSL treats as a level of detail, +// not as a texel offset as the parameter name suggests -- confirm intended. 
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const 
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: both overloads sample at tex_coords.xy and do not read +// tex_coords.zw. The first always requests LOD 0.0; the second forwards +// texel_off as the textureLod() LOD argument (NOTE(review): confirm intended). 
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +//////////////////////////////// END INCLUDES //////////////////////////////// + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// + +inline float3 get_gaussian_sigma(const float3 
color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. 
+ // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. For a pure Gaussian, the beam_max_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.)
shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 
'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. 
+ // We base alpha on the intended Gaussian sigma, but it only strictly + // models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results). + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian_integral_contrib() for detailed comments!
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(float3 dist, + float3 color, float pixel_height, float sigma_range, + float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. + // The exponent is wrapped in float3() because GLSL's pow() requires both + // arguments to have the same type (matches the branching path above). + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling.
+ const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb; + const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb; + color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure 
the weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + +#endif // SCANLINE_FUNCTIONS_H + +///////////////////////////// END SCANLINE-FUNCTIONS //////////////////////////// + +//#include "../../../../include/gamma-management.h" + +//////////////////////////// BEGIN GAMMA-MANAGEMENT ////////////////////////// + +#ifndef GAMMA_MANAGEMENT_H +#define GAMMA_MANAGEMENT_H + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. 
+// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.)
OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(linear_color, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(linear_color, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + +/////////////////////////////// BASE CONSTANTS /////////////////////////////// + +// Set standard gamma constants, but allow users to override them: +#ifndef OVERRIDE_STANDARD_GAMMA + // Standard encoding gammas: + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + static const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. + +// Set device gamma constants, but allow users to override them: +#ifdef OVERRIDE_DEVICE_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } +#else + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } +#endif // OVERRIDE_DEVICE_GAMMA + +// Set decoding/encoding gammas for the first/last passes, but allow overrides: +#ifdef OVERRIDE_FINAL_GAMMA + // The user promises to globally define the appropriate constants: + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } +#else + // If we gamma-correct every pass, always use ntsc_gamma between passes to + // ensure middle passes don't need to care if anything is being simulated: + inline float get_intermediate_gamma() { return ntsc_gamma; } + // The nested chain below means only the first SIMULATE_* macro found in + // this order takes effect: CRT_ON_LCD, GBA_ON_LCD, LCD_ON_CRT, GBA_ON_CRT. + // With none defined, input and output both use ntsc_gamma. + #ifdef SIMULATE_CRT_ON_LCD + inline float get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_LCD + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } + #else + #ifdef SIMULATE_LCD_ON_CRT
+ inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else + #ifdef SIMULATE_GBA_ON_CRT + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } + #else // Don't simulate anything: + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } + #endif // SIMULATE_GBA_ON_CRT + #endif // SIMULATE_LCD_ON_CRT + #endif // SIMULATE_GBA_ON_LCD + #endif // SIMULATE_CRT_ON_LCD +#endif // OVERRIDE_FINAL_GAMMA + +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. +#ifndef GAMMA_ENCODE_EVERY_FBO + #ifdef FIRST_PASS + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } + #endif + #ifdef LAST_PASS + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } + #endif +#else + static const bool linearize_input = true; + static const bool gamma_encode_output = true; + #ifdef FIRST_PASS + inline float get_pass_input_gamma() { return get_input_gamma(); } + #else + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } + #endif + #ifdef LAST_PASS + inline float get_pass_output_gamma() { return get_output_gamma(); } + #else + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } + #endif +#endif + +// Users might want to know if bilinear filtering will be gamma-correct: +// (true exactly when this pass does not decode its input manually, i.e. +// when linearize_input is false.) +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING
FUNCTIONS ///////////////////// + +// Gamma-encode a color for this pass's output: RGB is raised to +// 1.0/get_pass_output_gamma() when gamma_encode_output is set, otherwise the +// color is returned untouched. Alpha is passed through unless +// assume_opaque_alpha forces it to 1.0. +inline float4 encode_output(const float4 color) +{ + // Guard clause: nothing to encode for this pass. + if(!gamma_encode_output) + { + return color; + } + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), + assume_opaque_alpha ? 1.0 : color.a); +} + +// Decode a sample of this pass's input: RGB is raised to +// get_pass_input_gamma() when linearize_input is set, otherwise the color is +// returned untouched. Alpha handling matches encode_output(). +inline float4 decode_input(const float4 color) +{ + // Guard clause: nothing to decode for this pass. + if(!linearize_input) + { + return color; + } + return float4(pow(color.rgb, float3(get_pass_input_gamma())), + assume_opaque_alpha ? 1.0 : color.a); +} + +// Decode a sample with a caller-supplied per-channel gamma, regardless of the +// pass settings. Alpha handling matches encode_output(). +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + return float4(pow(color.rgb, gamma), assume_opaque_alpha ? 1.0 : color.a); +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ +//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D))) +// EDIT: it's the 'const' in front of the coords that's doing it + +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity.
+/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline 
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +*/ +// tex2D: +// Each live overload samples the texture and runs the result through +// decode_input(); the float3 variants only use tex_coords.xy. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) +{ return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); } + +// NOTE(review): the two overloads below forward the integer texel_off as the +// third argument of textureLod(), which GLSL interprets as a mip LOD rather +// than a texel offset -- confirm this is the intended port of the Cg call. +inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const
int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } + +// tex2Dlod: +// NOTE(review): the first overload always samples LOD 0.0 and reads only +// tex_coords.xy (tex_coords.zw are ignored); the second passes texel_off as +// the LOD -- confirm both against the callers' expectations. +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3
tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard 
tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. + +// NOTE(review): the block comment opened before the tex3D wrappers is never +// terminated (the run of slashes ending in an asterisk after tex3Dproj is not +// a terminator), so the two tex2D_linearize_gamma overloads below still sit +// inside that comment and are not compiled. Confirm this is intentional. +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); } + +//inline float4
tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } + + +#endif // GAMMA_MANAGEMENT_H + +//////////////////////////// END GAMMA-MANAGEMENT ////////////////////////// + +#undef COMPAT_PRECISION +#undef COMPAT_TEXTURE + +void main() { + gl_Position = position; + vTexCoord = texCoord * 1.0001; + + // Detect interlacing: il_step_multiple indicates the step multiple between + // 
lines: 1 is for progressive sources, and 2 is for interlaced sources. + float2 video_size_ = video_size.xy; + const float y_step = 1.0 + float(is_interlaced(video_size_.y)); + il_step_multiple = float2(1.0, y_step); + // Get the uv tex coords step between one texel (x) and scanline (y): + uv_step = il_step_multiple / texture_size; + + // If shader parameters are used, {min, max}_{sigma, shape} are runtime + // values. Compute {sigma, shape}_range outside of scanline_contrib() so + // they aren't computed once per scanline (6 times per fragment and up to + // 18 times per vertex): + // TODO/FIXME: if these aren't used, why are they calculated? commenting for now +// const floatsigma_range = max(beam_max_sigma, beam_min_sigma) - +// beam_min_sigma; +// const float shape_range = max(beam_max_shape, beam_min_shape) - +// beam_min_shape; + + // We need the pixel height in scanlines for antialiased/integral sampling: + const float ph = (video_size_.y / output_size.y) / + il_step_multiple.y; + pixel_height_in_scanlines = ph; +} \ No newline at end of file diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png b/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png new file mode 100644 index 0000000000000000000000000000000000000000..2995ae5f4cd1c7f0c72732286e49f255c641ac5c GIT binary patch literal 198848 zcmWh!1ytNz6J6Y8ahF}(i@U?(PSN7l;#%Bgai_Q!Z=rZU9G2qNQmp6}D^T3Kus{FF zNnVnZ^D=qw&Sc)5nM6Gu2mvk)E&u=^P*+nn002;5PEi5a|8GKQls>&Q=*|k-3ILS& z@>hp>)-TtXcA5}n0P_D^d0%bz%N-mqH8WoT0FU(lI|`t%m4!iA~>+1~KLBSUsr*Z=`5SyB!CM&A-q70L@o)1K4XxUoHOH<)6 zTcDzlcw$?pC@@ViW#D%OcPFaAQed^i_ zPUiuE>=-Z7RS{j_=NOC-zPhS6&1*1K+^-DwAEn&o@w{K5T%_`}@<2yf&~18F(4= zo_Imk64bA%46B7o+NIQiYM=+uS|~CEhU9@GDZCS*f^dHDb0RcC5Pp_k2PS-Z_p8eE zz~sP0D75EgTZX>$gQU6`NWVwU9KRePpl1sp(o6Z9r16V9O3J`j%DpP>Kow=mi!RV} zg1PhtQjY9m3wp1sx(=}f=>Um<{xAy|aVBKH+IuY%|SKJ`8$cc4fom>k6y=aEOKY-JJ^m=9q~dGRuP 
z$_QWAe-C4z=K^R%gmOBL)z{3Vq9mv+oaFzaU;&Xyr$|p#L@oH)@c#pn0M&%uL6uf2Cpx;sYYhVMtRseN{vl5NQO9V20lX1ilOJ+wXejf_p-wRS?#o z=Pn>}KNNWdfBFGL_CxJq%$2EuY2uJYpqz>_>;rTV^!yWww1OcyzaoR4kVmc zVAu;#DZQbG;C>Z^F>EX8o)Yxj@!&c8d@w+Ep{s&mu1p*U%{HyRfl*zMEGVx-9Dos0 zaO;$NYgI&qKN1-TlcT+0gm$Z*fNp_7uv~k&W_?n9)m~KuBj~vnh}6a3?R-YM!Q>bg zyzUR|m+ka>USuP?;DuYK-XDOU=RA@qj13k~W zA=|F*n^h6)pav)%<)ZTSi;~G!rirWW0$=du3X`k7;UAH`>OTXwLg_c9KsxK+8w-@< zI#elMRN>-hTUEc$QUa6*lpYABdI9JQ&dC;3wf?)Lx}dmVt=zq)7w2nU*I7xN@%IkH zO$m_Y!x(dUzIk{ag9=tAi7OjD?d$1TNK4EE<85x9sQ zT+YAkll$gF=o540sLAZ66xeP$PY>b{8r1cm1eW{qTXrw_pvV11JR)EMbe(_G*Y4KR z0Zy?GcpP1q$7C@zo#lgP-tdFdieW*NlhA)JV5huzQ5mWwxmR*K{4Z#hQqj3*uRwTGNjiEE7G z#a4dmh@kMK`G<+sU7%qmd&B=Hd*j8#pxmwe+qq!nz;`8w4L(r(_cAkB;gVO!&VkXT zVAIvPnSt))7o7pYn?Y%bPjl=Cn@bV56{MmjzZP7=F0QFTvw`^%cDj*i!xVc#n7jcF z5gP^@E!&f61l(ZR7t0YsaGg=kqrk!IqAr5`_YvWbZ(w84 zbe;)Bww%3cSSt$Yv%mSLp_ANqgi4)30F7tWp@^b8@R*;_^9?>!mw>~|UgCBh(yQzM z40&x$RO}i4K<2%dfLD5~yxpf7nlgStYIeEz5Jd!7e(+5j2ke?Gx%_?>qRR(O#V%Na zAE98YumH3>hCxA{ty(A}l-zun;n&L5-5EAh*G!LUuQGmd{_4h?7jX8)&Ax)KM(JK+ zw?&+yNTF=H;h0FgH(|9XcM;u|fow1L?u6i}@=L#YdbUHr+Y{;W+O` zS>ZG7<*uL3to!di7kE(KUC`D}{VI?NbFqA4^of${E#<{@5+OzW4RU7kIA|F0pN_=p6ZgM$;t4$8D)2xzn)^D1=b-38_ttra(B6xhv;{Ff!JnVF zHF*=nVroR;xhnOhC!mMR$e95fHH>EnR=YitYfCYc9)nvtdPvBlI#oQ=hc~6x@A&Q4 zQ^X;W_&MsLcZxU!KM}3W1kr&&`e6$E78LHD( zAjPjf;GzeP?RV{Gi4qQ`Fb7M0#~9-kH~j~-hs&#EP22ocg2hyPZTuTFXKlg44DOfZ zF@wI~bPEK66a20htr#I&u&!sCY?y%NEGudNzJH*)sLAt&)3<&1=tsE?N*{mq85&MP6Gvoq0i^kV*@qp)Jx4Z5wp_!&a9>)W)o zEMWvArb3ew_*F-q&ebF`kTCBnA-*tx=iipS?9sJJ#LywuJ>7)}oEn;$4dLQCm3X5K z+mH;_od<8dm}l@G3-=Q^?Pu4bfy4S=>)r`wu+j1J=r}rg)NHv3_3j?9cf@}2vQx>3 z6F?6$v_)_h`i9c&9(yrV8(<1)>OC+EZfU^!A>$mRA+XXLz6FwvdR8UW(iJF18>BV! 
zc%0Ztp&cO@GNkjSm57k=w~~R}NWcrw_*=GsVf|>j6oA)wJtMJPZ7rk`g8K12jy7WU zdc*pc%~cBZbnLH23a87Me^2qh=!#6>9y(UNi4Bl z%djAAVuvfIfWT-K%z)H;UzoLbq)_Fw8c6nF&t$fS?zj7ut6<-Q*EH}fO|JQV0XIh5 z+Yd!G2z*G%cOR0-6_-N0{i2`8nBZ;L4cYklDvCy7EPY=B5BMZv2y>wDbNoS_PU6wGAR&x{|YW^ zmmpCV)k}0T9`Gog=@uS2a108sAS{U?*qfeoz|oWra1Pon!{JPTGF`kTDG1+BK*(c0 zXpmR{{hX$x6diabu^al0i_h^G&hX3M7hlIJ@7+?1Gedf{J*d@YIz5f}qX~||{UmmB zH*iu6`rHwG*aj97!>{ z|3TQNxWp4)2~vH(;+61YYedB|(TX!d$h#4|yjUBTOEz&wb|Ir)v4_UTbWU%5>Lue` zsaVEi6`35Kc8_ZX+?>Vc^bsQvJ|Tc%4{fTTiwrZOlqxDseyP2im2bhl1Kt_A;r9H( z&G>R&3ipDzz1`k{G;Snf8v!<<3K+9*2M>=~Kd$$`kLPZD>R(kDPj&cfHjIJ#!MVUT zfGG8!n&^XDZrxDyMCx#ndTA2=8s43W{7FnAnkdCUtV=UK;LsqIb#}z*l;vn(xIou* zcUC-kRlSz(9?)(0_v#g#9~2%ho`K94MbYybiiX*-we>F*aZbN>wwPMzrwa?5WIz;=WqNj2rC&BZ3GDG(S8UZ#}rzJ5IapH~5XX8j; z3hUkFP*B1a_8`4v0a(uHVkOD>7g;_@?GSnkt0X4IADIjekxkO{H%w~?bgU;7tl-fz zj};_2N+&w5M2cl9PVs#FzAI3rWg-f!|LS9l^J(PXTYRpse{q%a^X7R0O0+6|^c7g# zTesK&l7xG*(AZ{_cV{T=!UuIT=tkkfycb_BSs zdT#^v{cgc&{;^JKjv2!$tZEQ92#&9?3PL%pyZhg2|5=mtXN^SrWp28qPW;b;c~-1a z3j5p3xxh`t63=>gyj;DO=9MPQ9tL*Y@JW2@iAi^_hvNBj0n>Cx^2J}M-F;MmmEcYL zm_xQrCR)n~IBLlzU%|n`)wsl8Wd;s2HrC?Vc&yTEgAWEz^S=4HwPAzL|DgUdqCtM} zW+>`F`&0i*t?wV*Mb*W-yY=&2@E6m8VUlM|nt|bv>4!Zi(cJPkAqDdYG}$1oisGZR zssUe8z3b+t%`itwE<_M(em3aG0$trQ!ytsm7z}jcE^vwE@ z^*=EQmEb4WmxZ$-44gOM%#8I8P736k0pvzIpF7KJrF`7q)$rA9h~>$E&}*qbO}5Uk zPHswib%q_v!ijPO8JmuNK|&Mnh1k!6KCyfPGk|4%hn+gF(pvkosu-UAfM_hJ-9irv zD1dha>D8A+VFQ?X=&m)TSrfZ{?1WQrL@q*xH^rw=!u3C;`3O;aYlnFgSMeFME0f0z zu3n2|Hs+=D3~$OL6ap2ZkHa!wb^d#{+jO1!eab_nb`)y%L0?s8)DmEDwTw6?z|r@p zzI_ciC({UBl$on4THvg=p=0g0M56Yq*0FO+>w~Ih~t>3g}$Dvfu4Iqz-GzrE3D~J4$z3f2fOX5;!4uQiQ-Z4IcOh&4|#j6 zLfKw_Rz&!v-cJtb?0f&9iT}J_2=H9;m=X&`eYXA(4i7;3-uwX)hd@|9#J+7EJzEk5#L^$DWwR!tZ;)tcS*o|Xxn)nW-D+*-VP6nh4YApojWMFZ(h8RXfSI)F*p7{ zi5EW37dB@VX9Dc#JT%%-X8`H6XS;NXZR-Noi8Qh9hbv`lt3B1xXQ!v6y_Ex&)ZZrs zrsacV(T@MSLCL?Gn{YVNC`gNi^StWY1;28VkL{(2itwcQ7Kc`5Gf)N&-)83{>%V0E zJAaYMsn3>dsZ~Qu!nB5ak!iTElxAk2=n#c%PnrV()bF2rwQF*^ftr8^#I&!N{9@tWA2V~clAX@NgZ}zGVG4S 
zU1?Ner1~rFEC5vJ_$18lr{{=C14KDK6mH_d8>s0wlm;mf!Lt%ic`hwv^1%Y;b6b2` zHb$n7zC|j;O3cE&&Rd#FOwLuy9)#tsHFHbSI}U#moO+$n`Z7FsenV%eXIQ`2hxxB#GZkV$Wt@7Gn zLl4aD#JGI*<2hiNcGMfQfR~l!&s>*DFFt7SeXr$r(!YAQ)>Hv&C&tOcc*t(kxI5C zJ6*94C-M1kE<@SRrwO08v(UG3e&uUoIaeanU^}b$?O}6KL>$SkZ4|FICF)n`6(paG zUb`OFwSQvgS?3z! z0W+lxKlHe-I}$D7w{Lm=yP_*&?)Cv_T}vsfRg_KYT+La|2#vDAsDh~e27xBRZ zjr2?H{(x~{`d5ud)Sp}IcvLy%*5{k=mxTP{OA%BTap9>Ya*@-)Oul@)RkF$mc`cbw zD&ygled~hplq!~2Lmo{dVvgcfHnbG}YV}NfgNu{VPFM(6Y|k@L_kvgX z4`xR7qdnE{uYKqN_5PS>Mx-YP$Cv+-CGTw_&fE$=ge1M1tH!3K=^jKMJ{s-WQz!QN zN(^9&xD?{5BN^6z&>2H9N<<71MgG9l@NrIwaF`hho$3y{sw}_7PilR4e*x&a=|OXC4?<$Uq#82{l_CqTzd+?e%3

z;&XZ}sVbN0K*lVcZPeDClr^=Zoo|@j{Y67~BfA-vf(-f=?BDG;-%>8~yL20&sOe~9 zEEc!jGhdVuKpREV$}wW-&R-k!Y|4K0jI6fDNT^UT|35DYw}4sqj}09=`}#@-cXBuM z+BhvLeq;age)LW5IEBXic<+W3;kR$RWJ}L{Cwvi2{S9wW?J%rMp^xM?%_7cLLKN3h zz7pR-C-fQ3`_H4sx8I^+A)>-#7hpm}B*E!Ll8Hxywj{GfbQp%GS$?EJiAna25AS8& zbc?-nj*GqUxsUGzoBcDu4s1!;6l3(i zdLZ4xxJ_`m)ceY_sTeJLSNl9sD`ODbNyI~{tW(`#19`+?|x%Vpj4vGlJ_ox z!c+P@Az9MRXqYx^BOA)Xj5q6|5S^T{!8X5+$|)qmr# zYJA3=93K^35^JAmNw1ZF$pHgSEo4{H3Zr2Q%EYQSd60$2e4z(k$eXw-xgomggwKzg zWtk-1aejaOGw}J`=O}0WoRR)|Gi@Z>o4?Y~XKmU8J6KVGHArXZhA7gk{(CNC_1u>E zT|jDmBVH$J{u|LBcPZr+gkOaoN3le$ooXaU{|cG^w0xICKW6*cK}I0&HdU{L&ATes zWfWD@CmT(-QxHJ?2CFQ%RXCkIcH3pQI4<+<~EMBiIyA}BL(m0*1EgxHuL1#V^+YlttX^#yB!y|ephDJ~6urm-?Q`k;l?%qFso)%!e4^*U*XgRZ z4G3PFHmcD|Ei?1uC{NMXk{sTiax5i-)E&c0^@p9&HmY|7ZN99BfhhTZ zC(dM>EZ+Kv)>s`8xyELi^bTsmzwPps9#D5f1a!?w?f7vpKE^!dKcr0DlK z0f1z|%n1$+KhUH*fvX^fIB=jist!V%VANu+acFF(%L&WUzr*@Zb@C&d686WfUgWG@ zTrkdD=%@YN0W-7EFF_*jrA&WAK^JLW0T?9u=Lumrv$o}Rqs86F;kOOJOS-cEsO=cL*Dy+2^Cs7yeV|kQNkN#C zOR2I*p*Qc7ox4XnN)nKUcG5X_0q85qTx2j*E-kF!(xFjiecuI9o0MAQOKv_05)X zK(Dhi0zxa<>J{17@o7UzDWWT2GI@qpLt8#V32C1H80h&Z;;+9!7dQ%EcENJ`14j$! z5CeQEQ6^CJoaWV{{1LfqS-;UQ)1Z&= z;Yi=(HAZpEcO5h*M80UidbEb~mwykR`FF+d@M$9#b5pyhyG;xFnZ|SQN%T=nm+n|u z9@BlMOVj3Gto6fi_ZaF7h;S!c^!2*Y+f)ZLR8%|~D_MDx2NEp`T9tlYx=4=lF6cm7 z9r!sb?_vTVpJ?B(JSJX?&B52$LDqgyLc{2li-*PWk1mCFfg*=4{%?>tp!|10S&7V5bR`FYEwo^AcE_Gq$30?hpC}fo-n@@`&46ftnxaa1^ zoSww_Vz9b~b)6?c(TPb7tg>TxZm$RDp-s87|NH=o5cq6gD_Sn>y&KzHrm(^dUqd5g zl989N*F7BybS@$?*IN&(nsL#OEjeM|t$UKwu4@!LSASw4EcDch(W;J~7>_xsCX)R! 
zG-+AP<<<6IU2XE8RvibthJMP;7@rn=7r}~0RnjQ@b43PYB`m_%RGd*+&Asn@@&7%1 zO}lfZ-K|FQpRDKLORhYRXYc2%`_j0n+WkIei!MpM249KhoOHzAZSE`ayjGwt&+u{Q zUMA5>;w^3$AZb?O+It$uz)h(P)rGS6J-{UBu}!WPLO%|@i;2iOepgYxi(%e2PLjrV z_wQ^c@pSMsxOeG@T3!3S_md1gPxCeWW@XqR!{XzKa7;wy>CL-;eyo6#?!lo7<4j%Mw^8@NhbZyLB>Uc?PEGUQYiLf)`;L3s96 zTb>(df;WAf*}4?;IWpT;MoYHaqTX~rco@AvmPA(;;!tD|jtn@33oyX15^7UWp_Sr4v$JLcd&I-5sY&QtMqfNGo+g_0gZioRjj{~gg z=NMP38NaB>`gjXxY@zKV);n?;1>HT(w?+1!kEt0Pq10|FTTqz2Cl}9Ws>FgCJ-wCX=JZu-ECMy4uxW%dpWG zKQ}VxwHyZ3o27KOll@<<%73|L{xz)Tbdj@GetKEH$MdCHA2*9K`3MKqx)OPuV)^iu z^&22=;(E0*)uO-vZhY}gi1cUc39*oYGOaGLt&05JOy6fW}cby65b)#Ca-< zYU0@7S3F_|VuRUE5FUBT_c^zE7#Vfg7uCvd4qVAko+sLMN2}h+&z~f#n*`db3~W0c zVcch_r&?aYQV8Cnrq3S|sZ7lkecu=gP`i!DEZ*c=uj-?5b!%PFQdj#)dO8@|UNz_T zEaL#C1e$e$5ok>npJZDzKZ(?Mrl8`0eC-94sdKtpg#G^McHb)RCD9~@D}CttD|ZSG z-~H%wwG~+IH-9yNS6LbCjtq{a!{;P-Kx_E>r;A$*HIk=oXo`&ZN(j2zHp_{fb0+UKRl<{O&>0IF!e4OQKH>)pxoM4y{phWVOyEtINvq9~3glcVDp< zn~dD>moH+PWKM2d<_mfGN&ZtLp!W&5&&?lFE6^f<9iei2?T>~aMoTa2TW1!sVr1_1 z8WJ($UUhv4t;7;Zk$+7Z^?lV&2*4OrwOk;3i^`JW{oZbpLQbD|tF;^-?y#X>qEIay z>E{AvoCzVSHl`#+(K@5neVZZH_JyecV>`($a7abxHH<(}cZ$14vv?CjGsskIgv+&X zT4E~C?`d<1<4G&;$eNjl)^aWCpcyb(kE@BCT6krdD$%%HCwC!hkhYh1aw9%3M9cHm zH28`)#xPhoBE)9i5GDbK{72j~6NM?|#FJBJFU zLipSVn75cJbFUQ{#mw^Y$ICk_AR(h#X(hr%;5`UB5y{EP$zC2or#~Rvy~%cLWpClK zDY{+Rs=mF8oUhkp=D}@xR5J89C^nVsHL{{rmWZ{|&I7eT6Bm_@yQ(;{fc|?5&gH(H zrJ_c~4A)o6M?h$qR2}t$1uCna8EpIcCu)8%nA};WszU-a~Khu84L$36lWx z@wVOdz*=>MWaJ7|S|{Vjt$3KZMXj+(rXGi}Gl^@k@hJA(WsI#75HMZrkWS&aCBGy0 zI|cl@A9(kSz#)@h6T-a8k3?nm%%A#1)PBkZ-ID-vodis#Oawz9kMt|o)VNy?h4-f!j5DDV{FpI)= zL|Q5FoVvKT+)Q!pYCYIP8>_4LXky2M%e3dM?Nuq2Qh)7ePTx9U{KLm8C%hME&bt(A zTiXW9Z50(WwN3M%#EEw44(%H)p|Tj4?$_2Cf@OlykYD!?-f{e-_fKC?TS|S&VA0n| z=ttu$%Sa#uZh&HhRAwsbuqNjQ9|iKXBGZ9#_dmqDcL5 zAK9$ypI>@X5xeXJ9E1ofBRSFnw!-BgdTE0a`7&# zHm*<9W(bs`Iyci8*e*u(Gf%Me|G7-Vww9q0R=xZ{w6;E)V@?-f5XC?yy4L-CqMKz2 zMii}7bN2khZg|Bv3GY~AV+ji<-#2qrW%(2zaRc4=YEbyJ73`|8ElIlAkGYFdJi9G) z{+I&tQQBzL@(%EyLtuFAiLuhZLk`aH(ZxG2wL 
zhy2~6J51;_@W&SUz-{~Mmps$#^)A7FWh&G9-M?)aQSXi-jSM7O?n%crz^J0K1tlj* zoBf183YxhY+x18Lg}9}L^4g6SF<3jryVMcpUGLJ;xK-r%t7S@t6 zEV!XtvPs9J$)9~qwoP?bM&8aSyd#o=VUc zg5`#Sx-1RZ!?_ipfCHS;!Xy&2cU!#qfM2@A1qHAXVH4rHY9Fmmg^afnP=B7JN_DK_ zimXqs7VbF5OyFO$GbPj=FgFkMY(DCdLAKCE{4x~<8}fbt{YVAZ8MJgSF0A~>Gz@PwkymZ^H$@1I|y;9B(VCzcq#sNteQW)xBE@JWY4YMHwG6Q;QJb6arOz|h&95h|g0 z4fh*tGdP;LAdCs|SIggmcwS>Xod8RiN?y*_#W{i}$Z5rbno=~!+fT#ZZ=Wh;4!cSU z+lc)(;;9w)bZyIo>1=j$c%6p6g_k8iY@AlyYi>)*oWC@OA^9Is+a<0TNIS^hr)Jrq zu#%%1e6jFe?Xz|C^It%@z@xmB`7F_2`P7>bEHy!Ri$}$3$<&DaEJu^sSWg<`)O&Z=+{R_XXEzvcl=(Y;* zy#W5@mH&sBQZhh2oavjjDYtlLNgu8n*_u1v0sbMS;_{{{#|P8KcN`lu@@XyZF^)14 z)G*=~@^=i%3_nYS&b&wE4&!7(VuJJu&nbJMHxwy@<=?+;B`TS=fxN2dP zZ#sQ;Y^$vBX%H^~41q8?wA1!{vdX9JYqb;!t(N|l7_zPwg;%uVCpwdK{Xi5rd}Xd? zSlIn8GF^_k>r`YSd?Pv6_9OM(lFwe+0w{u!-K{6HRyb%1?_&#&mAkVtY|67;Fvp9+ zcR}?{LDq#jX9qSQHPz;b1G|xumr3vzq4ZcFcBTY7bKm_#E^J=(XJFyF-=&&1ziNK< zFhKvY7HZ98)&~o3V;p^>{R#VdpyM_!nsuFx#Fu+3EizH66a>|HVv8%VkjkM-dP%4m z>I_q3lS~&y-$jYnKfY95$C-Zk@3abs@0so*Q>JX(+-Id@5T%AGbT{=ifv91j=FwL- z$E&*k^F7I-31A_pM7DhOG-u`4W}x(TaEw)%V2 zL9UqbCRDlYW&ZMn6s+D)NcUI0geyY6ik7){5ZjJ^72NM@O=lAvH$s=os1!Apa)S<2 zpVBf~Cci%B{F~8H^fs(a`vrwGmO$Id=Pky?M+5WnARGtHZj(DfS0#g7!@`IqAg?On z&0}O)kHx_p^=@mSRzNqyMa9)n@JE)GItQ(NOkdH7L{(0#cx_x}q49FJ0Tgud{ktyO zi}hGZtKS*CwWt)r-Jv@(MajBznr93TY1iD8Z5TsO(hrx3Q|}~j9qh0A=-wU#Kc>=5 z4{qrl8i-bqAcF|xB{hD?nFG(pWf--CL{Z!fXu_>NK^mi`C%y?KTeng)7qvx7O%h{~ zjB!^{<=mR&+41o44Sm2Jkf_jXU+#G%m6#u(91j(oSyoQ`8co>KBxbF?2zDE^)O$vU zZlO57@9qAqVJDB`F_UQA5f_5PJyO%LQ7Q+bd1Nee$63T&r;-*Qfb39kh@J(4I@)lU zi9`!-DCC{*ZtX%=4YSBa3Jc!WnzQVE-c(*BX*_877V}opLoC7&C6Rg>Cz|VFx?Ues zqu2cBW5C#4yC9>N_Z=*C^P4xc*7C&4eFhS&{Fd6sHEnk3p>}pVMKev6WCp84RHn9a z8GenAz=}f_It{k0JGM*u6)LR1iut3+>L|Eo-%fqoqfMM?@lz+k%7-j&(Yp;_aL0^L z`6(I*A{gJD887X*K{4RheH+csJh+57ncC;sW$^G~BGH&;msG)xjw^ig&)Q-{+f&bp z@q&`-)?etcQ5Mc_b)bqK>~1%B&HnlRoUT-5SJ!pMvoFli`>7P~)^vfq_LJvrbsUy8 z5`OMKKj@;H6*GRj^wzVV`Rcr9a6V*lb&1K&ZRGR893!h_@EpEdU zTtaddo_W|DviccCi%KzuKjRr<-7}^ts*FwS;H>S6Usl5r|CMY 
zH!0Ecb!rWnGgy)+sHiFJLeSN&JT4r6um~sv^_vm-tTYM+sKRQ>76KL^|i$>KO(9V9kxlXrUlPyEX ztZuYWM!G77SV(gz1(l=43sOgrlvr!fQ~gN2sfR6qszqq_$&k2o%vR)5vA9bH`cMLyS(U&XNY7vE-o=}tx4M@V7m+?yB6Zt`%W%XeBIL^ zLqnVF0+q@r+FX++#lZv*@cotRUzGG;YqYP40-U;pY;qai-+CkrElxS#3OZiITFH6| zu;g>9AI{b!oGuF}{!tRZP&Wl9ka9#h|AGYGO3Rs#`mpGwiqB=;{Bk*HWa_nc2`ZEs z<6(VxyK44@d4%Fi;P3+0X9KEs>#MS^?2M-AC_kN_+cF#HZ`s{!_K4b#{cVuLPRVor z(?aZTzo59o_<_g(*a37)VP$t+pQ!(Vo*qey9sC;gd7NwmMfbG-8~4*6>K9+<|4BYX z4u@dp8x2LnwHTgW&q37b&4ck%=ltbdX4g`hT1H1nDcXXHwt!bnhy`oxY--e=1N>x` zKR7%h$PjaDGrC|`^-4Q=R|fcfX_9JP*j(9l=jA2CPa0Xw*)OjojJ~lYj@LEeH48W( zkLQ-xA4m;%B3cCFxwbE?fKtDyQU{v~DbtMtB+c=f@zAYpR%G<_95?U`84l=!HXJIP zy00R`8xI7WGe8}m_Tq8tsxewRMPJ)uuMT&{>NX@aCf3F_43jWA)+fv62qa03EHh}Q zRjDz=<9@#>Yf7&adt*G~t`lEQnHngBaU<1w%bnq-i~zBotCGez5JsIewc1y8j-`nA zER0hlQf20E@Lhd$QEInc6JpWx|9l{-+XKN#e_nf@6jXLK@Zym&xWzs?T&I%7p}n;& zYw1o_ZTZfI&2caVddVu(!uTxG3erY7^A2;iZ&u|PqWc_tdD53|D&f4hkr-artPk^psQtZM>Z} zwCiu$^Zjjv3Eo<#2@!Mp4NOF7xC8Hh47+uk^`C{GC*SpctaycSn|pWihUzMmInuKY zoAftU^!x*$+=Qx8Oe9tVHJY|G62Wzt%B1PtVt9#d;`^RA>mZ9USNXYJA_>f+CkwB% zP%G*Bc;93$)_TJt%Nmqo^sA(!b1!h^RnYyvk2a^#u^Q~PD7kLQ*s8@!bBUL}=BqhB z$in`dYu42Z3FY$>OJqFGPv{t9;rZGigK4GpX*xP9>O<5!Zk-WyFtA5et5 z+vgKiPIZHl?3=&Z`3h9JJws6h#b=WOsUf}JtAVte$AbbjXg0P9#LieqUs2sW8ZRdy z6Ee^&&y$0H``fU{QH8FiXa_@rjvwgryq9Nq&6As7lo~$Z_LJ49M`*i*LdK#4*z*(C z&0YD<{86#KmFNoxWBg$FfVR8ItGIlNs#q^Kr##2)a#}o26;m7RbP)mboDo~KUsF)E z{EVhIlfj^nBGz4ZcH0oO@JzBgv=$Q|pX%bJTP;)yKJTu0d8ru#GtZ(mb3 z*jCOL_{5F#dREAlOf9O;r7F8$puRxZaMo=9n{_=~#J?98s}NVCkn)8M<97drOq}Wz zr$eEAn#V+vFKTXOSTP8cmWt`U?}R9|ViXXx;%XFE4ISXuUcYNbUui)Kd>O>Pt|w5} zWoj3WhYr;LEm0+1Tq5n`KjP3Vi%{nHrHm*=6Urn$Q2=>P(5-mURiT#oX&M+A0p(WE z946$yPGHAKSUiwiKnq1d3{4}ZdUbaIgKSx0GFvX}BQRTvuIPhvd( zFK~k1sHE>*RAy%Cfeox!n?y_ge`T(F}D z=8T#%K5nNcYW%@VEpZutK8jH_JnyZie+f{G*_CT%JliOr`JE}$25N2-m zchjbal(04l-#?EyUodZk$+y`~HNB7EWQE_63kIEukZ#pbM~-JOsMw<~+z}S`V$d0J z01`!lQceF4Gj+?mYCW#aMCGEZQNk_%8nX~EwOTssYYbtM&d#oGV*S24IICuIt8r$K zcqJq2qDzuxRy3u`UcRf+q4HRg+rhVwrZa(>G+dAUAY*39a51Do;wif9e4*c9X?jwE 
zGMXqik@9nWyL|84-K|ps0J6299G-YzDC~ogSky%QF)$YYu!7=qa!|^S zjF_;Vi~b}NtIM2Y(jAMXRMBgpD0z!xJz3k{hsOtYU(LzBGi}3x1F9~}DvvUJtd_3$ z4w5!cUYzkIfBBuu6e*LMl_9`1>WQw~Rd^U)oXEURHJoeXP|wX|K=4~^PIai=iL-C7 zSLHi_Ah%ZN`{IHY!AO^DQX4*F6oUQNc>i=9xnM6fD5@ltRQJ)3J<(q|31l!n4iixL zw5ICc1*KyjpeypH8uf%tLdT#3?GysWS978e$d|9WX!04Ta(tRrn`^s{1*S1Bju&?) z${yz`xI^?e3o7jHqoXRNy+c%!Z+34L<5S5tX@?s2PSF0G#zXmmbLwCN0PcJFyljFc zxwXm}d*;$?&?dUX)9}$2@P_Dqf8E12X0YU?CN?7Fp5)>#{{wIN9i~nM7HOH2-chC; z9EUB`(TP9d5a8|D`V0x7|2G={?Eq?R)m=qE9vy9w{^Oqu*zYGD5iVtzxQIGrRNv1zSL*2}VUZiwj;^%nx8e|;`>m~}!S|Y5% z)gw(~=Cvv2d~gySZ*Y41ahz6E;(Bma)>JI5-Ey{gdiA=Dgqx=+lw&`j-G(jQca^7d zKe8VJE|Ib)#T+^KU6a&|Iiw?ft_G=bfKbvMW+yX`Zq4)tC6_-7y-RBQ$O!*VLEQov z-#lXOQ^>+EaqBgS8ETro59H((rBPoe zs6&XoRI7nd{TENPIU2h>MuvI#;e7zckgsOEFhn*1mp`FT10^V zaSaZ(U@pk42r;qVBCchc(T%yX3^GitECL3m?OEAQHjiWGpVCDI_hPpUo#Y~fi9 z$!q^De=;C2X*=>S)e9&E@5VJba?X)kxp(-sD*@&0?n&{+K)bN<4!eoY8UF-JCF*32 zGy6rLusn@Fi7nIjhaXc%Z3;FVkXrtJx_&_fi(92DCwD(PM`Kk*m?3e+q=+q;HLM6K z@U(_PfcQBPKcudfaS~6g+1`37FXl@(9zJXo@O*6c=4bAVe81luh8Q-(V6HNJFn2iO zNIK(mB0K<`Bfc_@RXW(hV!BGxdBk5OMVPq{{X2})&RU~J+qw%7J8ebpIU+kE=o34( zGrZEgr$G!94o?wT#IwCSr6Ku|huN>+NYyX7kO*uoS z&v49(CGevv^rZq$g}Uy3t>6%*tF4ixrH?(xWM$}0WI4pxI zOsY5#*yS)!aG9|N0KL8PRPb_ang<#tNbF=A= zV3aMP_}pgmx(1Pxf#?f0))Dey2e-aKz$ZR%E}Xsr^D5SI{<#;3-E6)1K^V?WkluqG zCeF~ESwfmc?vcBWY+hvne13{oILJt@pPOJJCyXd@scl$K1~|XfFiKXtWfu-MwX>x+ zaz@!7Y-a!MQ`?K2q_&Z2Tea7VtyjAL84DA9d6w56cmyRb`P%#SP`njCcd$0vHxj2e zR=QkYU$Uvp8j0rZJQy0^pztz7tpof2fV$+u8cSN1y3Q83bcB7Es70k*fw2wyEiFC< zE7Y4`*keK$)CAk5;L=jbJ-n)jkjAh1415uTC)cby+q2n_rZJ*5YnwFSs z{tu^!uMr345Y%jHmH@bnOnX3yo005@iNE7(CmV~h_=tSTEOC5|_ytBbaNwoI>D!mD z?4N+%$r3kd%*J&iXc$%0@tlHw9TK1&1Aczb<1FmIvj=MpH27|hBv)w}LG*TE=tceL zvsZtCxog+x^NL%S!wSN%{B#@f1tfCY{M$*E&{OVuzz4Os{v1j&MbM>H2YX{lbIX>s zF~c&p{OTN}`x;S`k6`~Z%b-o(4gfn6!z=k$6Y2(OK|B5y@r`QcL|#56<9Jm8o`cIBVDn-Jd(Ofe(>pAd zP-Utx0yQ{yg-v9+gJZ~&JD(xz%t4RFcB#HqMSNzxQUs~YzoyLHvx_y?*8$kbKxQNP z9ua_8We{Q^I(V&SGluu58y>AwqcQ>R8UCP2_CEke9n4W}1ny-2KQH_1IskheRxGVz 
zzyUBb2ib3H028>i-Ew$%fCjGwrrKgq>cHM`no4MTb5~bpRK0Erfo8YSu^ofG3roUs z6?8WM#Ap*_#M}sMdjUn>dgJ*^&T?&B-h^*&G4>55a7DT4LMgxE76j$zGT4VTizpS` zfP$l1wY-!xntWduaWi9qMvEj8mCY5Nj>ER0Bp%2va}LZ15>WxF;l(E9MgRz_!>FGv zCzM{TxHu&~L>W$Hq?M4eQjn6a%Il^LX4dng+HY?1xamCt%xz$A$R%V767}8;L6v)K09ieAB0Gw8#2gaocSVia_l(?gq(OZ4Gvn*|~grlA6w}|f% zmxzamANa48l`6MO;ky5lGOC3C7qobl=_SiGhEm<)=1M z`cJ%SFkeTH;D}Y0MFj_-qXMkR*e7Q&w+i0G`t-Q*t4$csY|_W=0ErvP41V|adVmtg z6b5C|`yUCE1C-`7^BqSU3Nlh?zZuZT-#EgYsQF2o7p5?M7X<76y%(miPGlXXbpQAE zPE##jx6GI1G}9Eg$5EV@CBK@|$1h;-ZV9Vy_H%~tDlq-lW=doRCR8q5d8^>gpv~QlbZ*-kW?o3AHb51d3+yPMOB5g92*(1KydI^CC3@*{?{&Ell2#9 zIq|NH$A~|e_)fj7Co-^&BB~N}QTI-bT~YUJcX4&bKjoyUJ1A@sCn^an&ffsNDe3w% zsOkuAs%;Fzf<;yhtv2CC1^QA=UT1l+a%nC$N9ylb>>W_xxE!;Z(%9O%J9|W#_=Mjm zDby_-@I=%TbCYAbM<7d&{zGJX`kAZ{f1=o#a}p#(dXD%jUKYJg<4UfjYTTJ4U2&zf z4y22p*ejvAk;ln-*-t9L&=M-)E{zNM$f;`34@81MMZjN{x1+jE1E2k3rL?k=Ze&7XIJYnjn=~J?f%mMAbZI8&-d27Drr#VKjTPe zrd}nZ5`0iPqFA+XKqV5t@AMPi6TvDrCT#V$&+v$67IvR(3i60XXF~;)nZ=YWePbu7 z!WepU01E`7dga8e<~C5G$334mi_r6Dpo9K_lFGrFDCVt_E@$CDT1K4=mJv@&F0Qcm zTBjLM8F3f!iHJXQe!oD6$dalhCQ29oimxhh+7tXiXH|i6{XT=q&bkCIEg(=BptWF< z66gmA@H@_s;G;}!XPdRHLTQ%%QPl$pG&(y*i=FF?=PM$9z$N8&5C)^L7B!?EXj<87 zk5>9P;G28f(Zy%p!UyvVSEFWgc&WW%gURjZM4B^GD0|CkJCeAH{31<%IyBfL=eMx{ zoLQGi{~PubaF(t8`QDm|YZiNP3!<ByTd#kQ)aiiiNK+;z@aVz~3scYZ?kCC(B~R2EuzX1RfRvp)8)6Ex-yP|vr?Oi$ z0Ga5WeOubH%)t3J2d3*vN}gLav@uEE-Fg(n9wAO)S=eAHOqQUo^=H1NgBgt&`;7Lu zeZ(Ik{&>0LOZ5&{E`J5u;nZ^6Yss6B>3is=_ zsrt~EU`7;_1lcXNxY3C9DdI;!V9!u*VaE<Qls9#E+Dr25w4T@oG3_@yRUo zTQVOb4oC+T72f@LLCYUh8jZ$!^KKt$00;9&s!XacD2xehk_g^_>jr3}!DN_6grQIG zO|uwG@F4lPR7eH);vCuMhjjf)MxA_(2m_%S4by;cHzXjz`SLd04qcS` z9|vl^uSp%QA~Ju6s(HZ>1-V+l^)C^>Bs6qksm*K%Pxk$h2Cq8~)wUA(d&Dz_P@1Su z9&a!hbz46LprgHewg!?-tWJ)Z@7Ay6P*@W55cybeBHjq#+d@1kMr+G7atq8O%7QQ} zxE-FB1o2=7lbbxVWrME-*%dww{V<9S;3(_Am&XwnRRc4wF}Rd{2K9R1UA1nons1JItV}g zhxv!TLyUgr0>j#VCj9QiQg_7Ak{hhhf&Yo23yUIj7V(LT=ph3HctBwC#2}lrh%K#k z;An;ST6;IgyZp&Z#2=Z5FD=n%52Y0GlCX%GwSeuJ;uJ=gU(nTvl&#kGpL6Ag47j&C 
z`uulx+|+gk?57~-7Dj&b8u2p?qTfA$DlZby$qg%wczOuAri4(kl0M+dz#G24i!X4W;_N8D2t7(IZfjvb~M+Leeq>g-7;%xxfpdQEyNR(S#O^Kqqi!1bZsEc$WpcqtbS^#*4v5 ziB1qex#*&qaEwf_`lig zA0nO-93}DGT%hX>@#MB`t|H6aflkDv3e-13WLyl5UVt$hb3nq9*ehqYGHqyBHN4y0 zyux;=*g-O|M+rk55=iOOyBO5a0O9WLCGDAg0iojqh+<_7G?umrv+ktrc#=g#`K zO&cCf#4PH+G!Cxe=Qd<;eg?uycJoa>^LoOJ-r@y_6$f5fEBjz?NGzinYyw_C%u5L{YlzZpn?N6qf1}JK#nJbogk%b0os1DjZ3zJ~9QkA*$5`laB4& zSpSYHQCJ=z+Mr|?3wwWGuEmoINn*?cuvFAYWyn193@o#}%+&}Zaj#&~b&eHmq`wW7 z;(6QeA|7!%!e(mm*%qOT#y&%}sO>TXR@O{&xOxCkaBxQxxZWPqtd_uXas9Jp;2G!a zN=TV^`yQt4e(i(jFMP#|bsU73h_Bq#BjGir@x4~?iEO5_+`C{c@9kYLk^F>2r~NYV zpPNWNM|{N67AB~R*ER%K8B|y>A4t?w3OBRkG1v`fY_*ceRKm4^=4Fxv!v%Uu;P*?% zdi&eCb$gW@^xEomR~9|PJr~IZbNhe)#hKOqXhzz<=L2($w&ESj6Y)+>^cU*o?b-qM z1n3qK{+R)@&JJ7dzJ*XVpMh zeKj4odgA9}u+VW&hWbucsV~W$IvP;5`5VuFW@x^cuaTPKH`tkqWHrqrrNa1TQ$vM% z=BK;I9G+pM7T?b-IO^F-&U4N29un-NB@hb}HpJ9Nab)~?y@^3C5x+$I2t_CS_0=vE{OR>?-}fwI|ts;JBUGWv{P{zQ;NS07VmonlHmiz31;R1v8IsnRkdz&CJ zm<+Zy>`#O8F~bzMR-> z-;;q#3HgQ~e}Ih7t*>ZX1301BRj+7?g_;hDs(d9R4w zLy?xoRjXG+0O1U$_yCs69;Uue?9W9qv;8E5x=tmNu^~l8w!es ze~{?0w8q#YKhbUm00r6!a}sAZd+(D8QPLQo8dQq{LaI)6Lk#ee!+RObz1{W*f*vfb z&xr)-=8yRIg9$CQe+O2y#IY_~+Lc|mz@-+aIbjijw5j`z05B0E=h9P&FMZ_xRTW5G zh+~}*SJM*om7J*#o_Yb1msb$;-pHSvVAz`w>etUL00;!g7$)YohZIvoq)Xg@;0aS{SW-`$$H0CKF)FA-I_)f_80J-L(dBhpgYL3 zVN*t`BIwQv&l%a~?EQY^b`F~e^bqj_eFU$drhqgJ4&B7i)&+qWHv7jc`gimvgZB;0J@L($h+8b5Yq+uMBofX z%`D0&1F21pGc0fkJ0yv{35&OB{39|*L2g}H)y!=ZR0b#&P-}v}ZaE6!w7&kd(o793 zN=aXo>Z7mgPTUN$lTI#7vT=c=Z?N2rUb5)Jlb^Lv!UY=2+Onb<;+9jemL(eoRs>U0SQ98P1hqnul)$hrXf+h2qR=&%;PTgBQvNbu{|u#B@(jJb*zNAU6!N-!6{-3+ zW~XtD_)7S^Pev(7tC>L|<;!@jMc#AB7e%}))?Zi#F>vz@k*w!GZM5>M7v}*>-W#Q? 
zBtR>*Sbt^MBe`TpV9Gdd1sv>U9Y7|_@SSnYNyODm9jit8bzvHcSb@kzv_oN2W;eX2 zfYTj0bB@zi+`MwOR9qkY`!3;8r@|gs+S?NDHVI_^q0GFeNQpV?-BF_6Py#9gsjPc; z!kOp%+<=U7LBXj2b|7@O{Ozj^PRS$w6!FXY^vgMCtbEJc)xy}N5lZ@J55P)4o_H;h z%S|O`J8;sAwH9e$UdNsS;AjeniZd%c?7K?hE0e#x8)IZU)mpc)c_kYjyist#qwLKu ziPjjC#ZwP{W>d^}+*HW|iw8+>zFQvE<8lIVZp^^8ezCGu2>$gA9KYj4b|8^H`H7r$ zzB3Gx;WK3gZSqq4TL8%6?UkWi`PZ2VZ79g731D%CC7G|5J@q|aNeu<u(dyodpat)hcJ{ci?68L>8f;Y-tEUEq!UjU@jeV0AVc~-_%8W;7nQE87A=Qb#{Uawz^erU2+jwtj8@T8_VyvRVM4?ZsYr$#uh-qf$}X29y>C}>8V z*XhhIRNO_{4Bx}tw4~{KkW2xAteUo5)#Bn_bn+eXK9KncJ+5CS62?3=ocWBGtmcNG zw-J19M_tMO1yr^tn?Kq8E@}W5s$UNgfy`cwBUe!2_?`r7Enar=tt3BgSy zh|kIOXa3_e$zii`Fr9J#Y&&wd#k5DE$@Gtcn}pH)tI_Cz!OQ~W!W|})0GV|xHdaaOSwWX zneoXxgT)Of^_slU+CM!-)WBdGu-Dw7S-;l^r~-~;Z!N*Ru%AAg5_50>A(gXo{@hvv zB@&$LlYg}t01C3^BcWe$qu`nq4)}+p6{SGKc_g8&$=6iV4a865+JTPjc4HJ)ZA6}) zMWw2g!=4Cz6=pjUx;>T4EwfmBWI}smt6LT&gM{cg7pV$r!fE$}_!`%}gY#CdnPRxf zfewb}f};Th?7sTc%*A*avr%N?iSU~1D}FMf^q67iKz;NIBk3l6F$`Y#Ho zWpWz>{_#2D&k=dV57w1dA<-Wb95(Ug&eV$?<=X@6(OF)5OJPI+jyQnM3Lo>1TXzb z#dUTp>Y2%uxnF;z>Mq*yg?g0AMVXPz=Y++Z|HiCge6UzoMtlJJG{&y>n*q3_dRB09 zN&sPPdfs-eWyuMeAB_>|0U9IQel&%5Y6AO+-;n`#9KOP`_H5bTk{Z8YvbPLek%EX+ z>kYjvDA-qg_*}34IpPi0qQJWLpy36Pl%X}=<9e@47BoZQ-54Z{Wc`j2e-ZHs#1K%0 zLC`N@L^nu)!3MwVu#kd$FOPUbx%jvl04}tFH>_m~?It`}K#d)J^B0}KVoQQ=;)*hN_*-7WaUbWpANuzp2CRp=~7S?=9-88g$|A z7Ku=1@v^s4c#b{2GE7SiP74;K!#N#rT{Y6V3^etOSKO~T=*grK8921E!dY-HgS{tU z5x{b@uP^~_hq;^y33)P)T$#cLR$F9#7x9hFD=El+-b~@cnCJ@EzXwyDj1qk_;l;uM zh?vSTSCv~vrlRGn%Ys91=~6F%E4lGExzRpD#J}|(1yn8d$@04*U^9ce3kTdUOOE=9 z+`y_5pA8ms-k!gTc#Zf6VC$rtEy*4Fh^L6Jn;G!5g%{;d8#FFEUbFQkA1*tuQL&Bu ze}*3P&6?60*tLZT)VwL>nV~?cwX~?aYEy}9XYd+7abkAsj^|PV>6DChoj{g3t2J|u zmC+_5ug-QfZs$i5qzRI;A_kRQAYaaVPgPhu-lyAGxXXx#iXj~t^tk^1Cj>)ph!q~t zjFqg8^3z2PRds_5pxsk|%5Z94p1L{j(hpz;^Kqb{&{1y~81&fKP>K`*OFdN)zEn^a zIKGf6Z}b6#L9-HbEm3pt@w*#sygHDDDQ61);lu^&h#w=eh`&l+D?8x>gPB8>E$JoV z8)cz+OFrAY-(UZp(wqx+v$Hp#s)mr+yPFpumFX(}zvd*j=GD&Y=#!DZPuThnmvX{l 
z8ymE6UcZy0ph^=axR`O&nPGP7^w|b+BQ$tt9B7Y4ym2GfP2sR!);3u&k}b&Cq}hSp;VTPk;Zm+p zB}=HV;#$3t2{k`;2@5F-ZO5@KB7TVYz~T9Z^m&BcOdAR3^^~@; zSypcj&?47O@=ZUXEzR)HsvBI;o{ThBFOGmxqrx<}86Krag)|_cS}b91L!uAtr8*5n z0S<&d=Qoz4AZBRr$V&DshEgcmW7-HZTjpFrd4@^f6H=ZKX5E=kctZ#hK(0L?NP57E z(ZJEGGA5VYqxC;; zBG}%Jm86SLDBJHbq6~w(AOb@ZY4d zE^uR+5oAGeZc0>0?tu1aO$TC_%&y{?HERl{qucTK5%(BU;KP+uk4Rk&vKgqt=&1pq*Kd}j+LI_;kAEU~oo2z4e~ z3OrZP!qrgOZ{GRjGu=^@v2}kbMK3Hd< zX1ju|pZM~G(f0`ml<8*Ha$lKds09t}BYr|sn0^G4GGe@9X9e~7#*C9x11Y&>nAAk% z(Fa}>Tf;N=ZUYlh?k(a2hc0Zn=jE9H;bpTGKP=oFEaC0h&po;H%&t7HaiHX;9l)os zH|T`jbGBBJs-j=mzh<1|hJ>Oph!*SnpKy@{-v0s`z5@!?n?=Cf%&W44-dlL+O~TbA z2)e-CXDt5C+)WUPU&&)P7QZzIP~k4_uso-+i?s?RQQgmC?V4hQ&b?|2i^C(f^h3mt z7Hef%HArgOtlS&tP*kqhlode3HKXEaQ?*)Mt>I;c>ngj`eWqhr|e$GtgzqdBn<7OUJAOo*WQhl+g zHlpVEYkN2C?fwU*40yO#XHn0ShgJL+pVA~3J3EmXpn1RC=>>%=_4b>U2;PzpNIj4 zU0}e@HoCaNoJM8;5S(gxrIwrBD8x%%BArHImD$ONtTYi+ht5Z2KUY*djm7J^S?c$| z^~C{hm_L%4(>_FeU=BiQZ?oA*A3959mHs3$#14sg%EF9BHC?F`spl%=vYAEe^po3a z_10Uh&X*P$WV^0oW(L`VukC>KxdHz($0+kF?~t7Z`G&ZdD+h4kw_>w`nz8ctMq%!- znbGv1qaDv*KfSp%74EK9)9*Q2CmY!v$@3rZH3byX8@iUv5P0+6D2M}B3zN(C24$6> z9uo|u09*iO=GFQE-`d!yfHsjdX2=|B8)4HD>slM|^TvsImq-b#b#W>gp^^`1;vryzywOBJi9reWnrTV!%a|`oQv>ll*VOR===0 zD(ERAn%KPy$TH=i_e9_~xV8(s)Ud+a>b+poUg^87toAe;{SSsi+ckeA1Tz5nwBxze zElL|_v#?=-J<$uVXnv$Y>%o+Fn?=aU!f45A)wX6M{%lyQc?T9bXQRX3=hqrLl&u3u zn7KHOpTQzs!0tQvn~l?}19-8i0X6HlHz8HK=0FPqWxhmnA|C~dPFx`E#N9zLD?VS) z4Sxd<+^zXLzyFRJzbYnLni7sq_;m5fY=>rs@h4h3p1sgx#P1Rz zjc(ye01PIul=zj}L;&sD{?r2ZVcAHSTP5VxZe&A4>xLxg)7wb9poX*62QXjd4*U85 z%N(4i0(-ie7Gsfu#Bz*&Zrvpp_TY>#FWX$e*7C12BUC?HEBt9C(o8o%twxop7)m3c zXCcquPQq`76!)G?Mdhmzfz!u`2NPE!3_ne z4H2|(b!O7j9D7tjv0Ja75YTAmHWKkOc6$%LXx^i+ddeMpB7RMBawW@?QYKZxWf*P1 zWr*z<(YKFm0D) z+cWl1Q@>S2mw)Sdo!HJM56{5uY(>0Se}ArbUa`myzpbb`BkR8t@ux(Gpb)hO-yCd! 
zVqs6JPQ8srGH)@RH%!{bj4y2DUcr(Ma0h2BXS8e}C-K;*q4W{oBmUipBX8!FT(q$9 z2RkIK=k+NR-ajIKt2r~7)dn*JqZ=5aFz1o6THZV5k#ytXh3@47X6CEr7G|dB<-LIv zUamXyo!$<3u_m8NzlV9nAMKsABj>RIvUMD zBR&yb&*-&FE2>ZHIRgG9dm}nau2WeeT-zB;kKSv9~< zh{@h+{wr1(=6fW)G9{C`7va03HOlQmBKwjhHIAx`T=X|yO+(2S9ttkr5ih2hv~RhLIvdT zJxZ#QQ*s58|8se81`0oldo&8w0#U+PE!>cWVUJ8m#t{~5PDM*M+BwiAa}%&XFe5+ZQX z0X0zNxd(uJvg*V4W8BCHo7b-)Ke^qjo>z>yGY zQ3K#c{wz@0Gr2?0c0t^nVZ#eG@?<5BJ7alulPXWB#i2?o;meUKvC!2-|$;Fps@WSD)i&OjPm zteBIAzNmp|0^b1Y=>g8D`xGEOZIXg%VMDRD`J>SqHY>c!kq1#7^~1UoJ-6i%;(LwA zajCDohh=Xd>Gl&|L)c>>LK$LY_rE~|DTW8yhXWg#6Na8hG*9bBs1k1Gl>lOO1z34 zt2xXw7xl;kjLSBm?G*9~3*v%@jw0aR{IBJ8i;XG%CZ4Nice*u;G5dtCnC)nD+a@ck zN#~n_IF-M{z`zm9FJP|jz;}@#(0No(zAO_DTzTZsX9gf{RxG%ruL&{eC&F;bwgc*^ zBVKutmhhCUrCdJp?N>V9XeKYYgoFysy9Y!J36#ITk#I%`(o~uoKJV$(i_Nf5$r(~L@K9b<%1cS!R zPJLn7=Hj)G_6igNU1tpJN^NM`^nYsnXj%Ns{MgDMXLY!!O>oMk+_2P}lk2pPJjdL-%&JK{`8zDqk%FTd06a zej+#cr8@S-3TplO`GZ5(GwS%rR^7vdUQPPAQG~xzU8_0bkETBxEeTCKku#e=9)RtZ zV6-P_swZSvW0(=PW)J5fDfbauR@^>qu7A#n-%}I4@@{}L&Zu5K$nD`rI3KkE%YjVb zhSGFZPb50G=NCU+E)nb>8S7in%>Vs2h6$DozBsd~H92vhUwSrb_4Wuo^~HlxaC5$; zn);dT%#-t9F?B`|Q7$Y4hKd(W(Nh>hsM_y=BV4lVI-?uSyhX$#Qy2EK-#YSOO~fN9Xzo{T zNiq+t=iaJuBY?2Lhoy;($bD(F1^K-RA@IZ-6iSeowM^kCjOv<;*k=-zg^pVKmlw|J z1JSs{6FwL+%+~j>B4kCTp^C=~HY6F~1_d)e%nS5NV~U(uLY&%|%mo6yApb55`|8|< z$8FS<5!Z;nB+A$9 zV)x$7ht+7+i1um+v_#2@0ngG-NFbM<;hg;ad&D0R5!6PCjV7u} zGceU*)r0>FD^lAA71oBE+JC|Fh@l;=HgiDH)3JX;62Kec_WZ;1MOr|gyUp97~OtpSymov3|6Vayv0H#qpr%-*--4rx}HK=EA(+=-X373dvg)bsD2 znYQrkNkqo*qPceg%YT36-hwvJX;T6s z8wCcw3_1`D|_RGQITljS)|D z+wat)Rwx4S0CP#%(32%2W#I6Sc3&n9NI3$IPsD`G#yM zV|(uafumIg(g2u&y;>M^`JJ${RfDq9GceIV^t{YqIu z)mVIVhBF(aqhFt|R{h2>L}?|DOjBt|2s+O1oQ=G#DHwHoHDvt*$F5;HYaIAS1}dqL z3Lvjm5)>U5gt2Nf2CL9Cv0Gp8SUHfeN-DUzf0f?$z(~pNNc z8&zb+{n~j1Ch;8cMA7nV#1!$#X8KOZ0$$b(N@jh2;`itO+?eHGu;p|VM8TQW`m0!8 z;JjMX;@tqmZ6#|Oq(|mRVY7epQBHG)wQS2UcfWlHY9pt6=0;-&Ju+Ka>=IkZRXEiIGimpFi) z&FwH|=?A3h{2uWqpXMlooT<+_=iQr>x`hRqm-lZ+ z{5~R&_<@w?nHOzH!q9<_`^{t2ld?@@$th74HF-eBgp_&&^My4m3A_ 
zKbhs!Rzu2`!>SfV+}vnz0ms@vw6;)6-~|Z1GC1~;+Hc3$7C5yOFYY#4Fz`* zpEw;i2c8*di>3Z?#CT^q^J0V3Q~x6X-5PE8V7NsKxsriq{Qd($IJZtzWd;&9r5ebV z=FFU2Xt0WOVNOay*VKWG_V=1)(cmkAZz3vfAjsZcr{3}AtIzhDr~x~9ro_8&fcuC~ zxSpBq`DCo15Ny9$#L7*-afRqCK|L99PbH#!ZG&@}<~y4y$2Qqt2)=f58g;TR>3bwR zlO-;L;V@;|srOL0yMiO;n;&rc>f(D_^EVQq3rC6Yscn96YZ{Bi^NT_l%-3m}hRPtB z$;=k(UqyV5_?yTmb6l6uWM)$Kc9TNsD4?hsT)`*5LQoFyvG>U^wfSM@-jy9I+7Y@k zj`1TM?h6N&Sq3_xI4(eXI&jr{1q7YnhBCXvXFbBk3+ZcR!rB`JsEXNz3bJ&^6BzJ{ zSG*&wTY% z95G=X_&xO?@dG)5ajOU{?7rtDXjOVdC8M354F@J@INIxof~V?;N#hy;W-wlNc#l5f z8jSR$%t)&MR+jd%tJH^hhtr;aZJ=EyGN24~$^u7-g$W+@ipLkF zcV+1}3@8X-FX$JZIeL|406VnScS#mKSl9Xo4sdNoH}S70DtKuC%Q$O=6|O}j1v&1^ zdoso*;-`o|z_zIr*5DFG8cpWRGk0wNyBMXKhcccw&A{#>{$5i*D&Z~7 zT^*L9nA8W9IJyfML=^{5NR(nOjBjq?n~3sv5Pmz;g&ZtVUM!xv2*Wl!H}9i@;$>3; z5*9vyF? zFtZ-u$%bEv|2V9tspJ-VMi|lzBv6r4U+pX|DAd|w`bgFxxTM@WUzW%hpNhz+GM>mg z)qn<*6mPSTyJgQ}X?Jm=HhWqRbdXG7hJm$}8Wvt890Z{21@Iou}V4CU^RxtvF zip@?mIn-IIF-!6u4DZRRvNeZQ;p=uR;B&+mnyL=`ZiO~%yAekG;EXyiSlnpXQ}u*} z1Dz~h7fj@xY#&3}L(G0ApFQ zJ=l4WqM_$0eu?-qRC^l3FE&S@hX{~Y+8IPnqh}HEA>wZ`J*=mQr^W?UZR<@ASEskZ zDkb*6vvEpKj5|25?JqaFGy^nv(|N`$>T#U`1pC}_^n@NNOZ#Ui{KtsD>l3|2G=GTr zWXJGmW5=C&*p`9L#`DMsn!z-frByvQyrZS&=Kb86>mq-t@-lJ=w@XCW`$MR-+LYHJ;XYCoP-~hSg|8Y4KsCWOF~i)Z6I9@Z4%5Aws;gsE z)*+FR=7v|VJY{7PYjFX^CWy-o#CO(5F6_U{qkCFfTAIdiB#SY{Et(lXX{w8CQLg`iQfFQxT!=3FDC(RExto1S+t zkI^9M+(1AD$+OgV&o+Z>2PE+fPr){d)zqS~ZHx5?(|OAWDyg*$OCHwtB7%w`>)aCWp(c9W!13 zCgKx1uM_J$@DvjOW+YBFiT`NulBLo07xSm{Dtt*KIEnMyu|hi-wKv>}+~GC@kwIIw zruM2xC3=={zP0;=K^U#aLklJeRt}-^fZN=!&;P)E&KT+?;_pO!A8aIIwgDi^3h((Ge^BMt2)_e@qwN_o$URE9uzn-Y$&8jK`%dmE}$FK3wN2?DzCrrp9u zsvXGy6wG#nv4aO-Nr(neLP-pnY^ys=@VNO1zEmPII*9!brckcd7eD+e4l9(O>3|{ z5NU=eFKk|khCg6_-gsT|uU2?lMn#$i?=e|#*}U|ROgT(-xKgK%DE8)3?3aia=4fX4 z6V=BSt2gh#nJ^$>nz#fdXys-GpyVkhhRA7FCQVXmv?#*W_S*2=eojpAnaDm_FUsN! 
zluQEeKwvqW(SZXlc6v8dCI3VxY-<&mu)H^duE)^p?*V7fHNwMjRI!iF{S%GnFMKi547ng z#%1Unfo>O6M3T41I(kb7Fk`4k8)A{;PR~$NLjs_}RAgd7j}f2PS1D6UWG%O6xn%$| zSNG8hk1}at3UySZG+4%>zY@WASnM1FK5*+hv=HeYjBI)XAW}8#!Xxc0qu(%~5or4Y z4s1*xJJ>#5)@MjMi>pO&*A<+sr(uzZPkg_$d}YQVFHGM31N-p{8p7=Z(4n?@Kvg*M zmg2AsC>7F3M%yunw=oHVyw_j>@R|gW-a7Cbt2asx(Q)x+vxZTU9TX(bovq?vqm|PX zHX};Q2h3dD9c2DwhyL^%FZGf{bgn`Dn-M=n{0;geMti|{zW#EHO1QD2ON%oe;2%c7 z?Sg{P7w))Kh1{l~004jhNkl1$lD= zYX=i1R(^&i>+e*%V__pk^;gv#uzmmH<|c|houM9ylXuB5u8dAdv0Ve!e&gTX}nI};bE64WOEYQF3P73^8I z@=RakT!Fo(F!Z_3{09WX}AaC{>}#Swp8Kn$$>#R(1XD z3XEj>6;u7Yh;OFbI`b>%)2bCg5u;(L_X&tm`f|hvlz1HJ?g|3a@0Ra- zO5Lc32v;DVO~<@}0P|(5)Xpf?7GdRlJU&ExOd`>=Ocv%N^1v+Lo-N3zAZ^QVB^f9% z{Q`O4Yz`$OHa0Sd%9zL87#WpxdU&;sBChzq*|O)BmA`?t=H?8t^~K%zM$7zFcgXPD z(lqpI3aw)n)x1Mj8=W|xRW;FkOCBM!5lR!+sQ~1oIj|O~@J27?$ePV8f|!#$;*pmO z^w^l%#BR5sWP7BVIRS~39Jg4e_ggByUVxXD@PI7EDee0^mVC!YD;GFUyAL!}siV!|kNg zJmv*^VjAdVT?-ipL4yU5j94Br?d)^k&WOw(o;Pw?9fA+KVsMo6Ux8!oZ&6NL%q zN47q<9=Zds1AF6NZ)@lkIHiq&=?c4MSV&qy3_SBI5p&{*HKvCC2A4L$lP(Uht}Fy! 
zVm&9oeZ$D1w1%UnsDWZLUYY}=++b$0ZG*zg?Y-g!#pXBCuanIM-}tCT{sv4NxZAyD zbJcnpf@f}h|2%cD_kvidP@+nF#1!(&z!gsyDID@n;&p8 zDT|?nE%5afxwAy0-W0W=0-8XVgC%}aw9Ucf85XW^05f4EPmFq&jM`1OF_3q47cfB< zTsi6mRdWe}Mhajjrb(phFCEEbaBr3)&z^v8cl}QCJ4go%i`iS!>22vvjk|hu>f;r@!1-ae_cF<&K7?pVW`|y zZ~sdJ&}1xq&iBQKUKxSgW6Y_HO$Lygeyg?e)s-5?!)66A2yP8DcxeY+z17m%Z%d)2 z@xS7_O8&gDDtW9CpiEG8n;i&tGF~BnZxY~uz&r3#4S#h9sc^;Mw-p3oX^QfQk=Xn- zB5X2eo-9Gl)X@nSSy-AWiTV|zxUMtc9IBwg?_78>;RocQvnuxUdY*sApv5~v&M_ht_DW*KuN@jB24l>xJr@2j@A&EgIwmbv2ym+;aPm&xo^1-AfH zQuN!K+YeQ7`RW5@nnMjny5G68@J21zKsLnA}5vjNKCz&0)9vWKxtL$!9ct&{zjkQ zj%0sPL{;HZ6mbXpDZHVZ-=?rMFG01EGu~H!;Rp-!Em1M4RY-j1NQiB~WQ7d_OY?Wf z^_4M(JqP6!@eO>qVf`utjrfhL4Zt5s2p;JL8rFRXLJ4g>)Q&k$ z%ZWvsJSim0ZN+iQ8N_f$ZWEm31d z5Y^v%JINa?#AKG7o0n;AqY74}S*ybI2zz7cYkP%|2%7#q2h>>Mx&X>mTv?rK7jrBN z6-UYW8u&}Cv62nI=X=DrbrC8MFgAGH0jJklS^bl3Q~J>smR2s2gk^wRAJ~^HF<{AH zXUOzysBa!5(=bgS);PWZnyJcjv0|yJWaVb`(iV8n($N&`9W$0ytGlcz5Y3*DdQU1C z^^S5avbG8;xkY9MmL5)tp5JSBXj`2hNB{z`xDDF!D6Nqtmsh{ zYFev9h6PzS`Nfr68~de-OG^f#zN~Ekp$hcD(x(FJJV(4T9s0DMgjQITy4obb6GiCU zB&ow%hakE4GwAmL%j-zOI^xE4GXV62j+#9FIpUF^s~{_0JN*bZ)G$|XR0ZZGd@wQS z27nZ-*6fO!F0h(MkW z9@3f9k)E!Fn%X!6BShGh(2-l;>}a*NDtT<_viGK6=_&u-bB`@??y_#A%J_0g$+1AS z{w?Az;&&X(Z5FO4@vM*!NKsQFA)6Drx3w5~urhIjuaWdQv)2e-xYA+?z0XN#4VwB|v);yR#*5Nb?a}6l~7#L-nTbdT8`8iul zd2FJ$fn}?VAsI~}dgeHl>mumLn^yK#^3Rz^pt@OZ6{55cWe4;aptHh)rs@Go=u#2P zCF19ZuMxjzVA5bopz=FU7jO128Ey&I=&Xl1Gv!dRsbdf?a;5IGW2xj6Un7ES;Pxr? 
z1C6~pZ?ujUd6>f5JW?fK#xhiPP)d8b?T3gm;ssUJHV0c^uC+nwXAagtW%#-&1X2-c zu3g_*jL*z^E2dHI);L%Ne<6M0og9tDx6MZoB{RUhTuEY%n5XoN?+Tf;A1?$$Eu_ek zAQ&ugh}e#ned?*V51df(%y-t(d)xeb=z)BxCY~$+DxujZ;>r8h2E}$JN11>ZTIrOG zJVIb94^z(i2p4%-zYs?URn&LS0UCL!TFuY3YwtIIT!uPX8@gK4I$~3-RUA5~;z{nv zhAI$G2cVy9QEom#v7#efR3_e-cYbdTzl+ag)sKlr_Zi2|KJ@oQ$azs76BV0k0N!Oj z(sp-@_>m!P1wl5|QA|v*4Fauz!ac1`Vzg5G$t=MrEY_*y`NkVAcf3#~OCJ6y;(Nqj zns&YbcTRw&5f`?gU?#MQ9U0_kV<0Q~kxLH9u$f0r9HhNnWU^-b-V{AU;=?Px)^W~8 z_P>I~m3C!YMm?EQ#jn${9HoKT)ZyHu8gIOPZ+1NwlYQj)<`bu4@#$@RfVO*Y(~AOo z7}f?bax5BidIbdDnG#Kg1KTddOMD?OGyAH0?!ueRD*sw;TDRbWcgur4@7PD`W2ply zdF;SxskgPXi8eXu&xQdFobo%!@S}0FA6e#-pI+1e3Pe78YZUa_fC@!@0wOZId9klj z0kzirrb@;Jhugx_3}|JgfG-Wl1lrPD5+?_k8(viO6bq17%<{lx-Iw!;~l|7_~N z4H1J@wY0=%CY9z(6&#cMB#ZXzE0xI%@9&L*R2{ikU%$j}wwOv`abs92ZPL$Z+0ly) zW9zM_LnctH9q6pMJXp%v(RgsBypsX%HhSsY4RvhQ8*{OP1$vA4vY7{ryYCb*DA0pK;_PsF;+*rLLEvz1w1HUn60ZnL$a;(@DF zDv+*!wlvA!iXmR{$_++c5&$)s*$b{s%u))N&gXB2{f!?>)E zE`CCJ>I;9nMsuMW%ES$B$Ut2Q*jn{SwxCElaLLI^Eg2V z)?8WHRO)~#rjZUX#s{OmJ#o3NSv-lB&$!#%-imRtN#Xrw`jitHA8@fXobQ0}{Q+(2 zcZ!MICN#n`KjRUO_>~vPs)wWqI?E_O5dd;lZh$x_EHggAa@;-@tJPTzWZ>`Rrban$ z3V}oc{fSg|H zZDz5C5NP$)Y?KzkD$`2sImwAACOCa5i_n`0II{-EmQLW7gZVCps|T4(?>?D8q%a$( z5)hl?F@#;u5pT=M9yzM8Z$=F*Z1@U%FQ~@tp@fwX^zZRkDlM1yU3@F zhZjC8P4?8&6!{5+aZXyPZhZhMPpg+_;(s+=b-fj*-UYEz#1!CVA;3MP#6Dp`3X;Ez z4UQ;`$QZ2IX?PDlNbmd_F<~?pq%jYcyPVg<&<0cadHZ{3uK{Su$lul}*km;FU^HxL zjOjJvJEL6}1~8d?|7yFe6t*CwPJo~d6;O)<90&k=3xfl*3jP%FWvylCiLaj`CTbmf z(9E-uY7$%BJ{gwu#YSKY{!ZA%el-{R_Wj!06jYB2g2_j=NvZLM40qyP2|vQK35nF$ z!f5d<{it3o2P`4i#dJZkzuQj?FfU)R<$*GUgj8V_==7sLjIi!gsrv4@=N+ZcZTs&n zWyx(8QODB>dzMKI$|(bh1HYI{ttgyk_JY9Fe#3a;`TVx4xq-6@{P-KAPy$D%4eR;C zicad^>IEH5Y~Mor6_nUJfO1OVW)`bYR2d#jnx)#vVRH_|>^A=p@%O;FbezQ<(L>Df zxQ2nQL%R|SRWdRbh(fWy)Q zB=|j8jcR_3?-X(G)(wol`g3%CC4U^aenSO#e&X}G2|+eoV4UCZJ8Hv|%DjpAgrNQA zKnGCAcZz*JEFrfd?DahgB_~@ANUj42R)kp}@k_*C`Rk-)0JFmAi8`i)ZI_jMNJaEY zA=O3G)Ap0GiW%y^-r~vP`nhe=NVuCr#|y}IXZN-7ne|`0h!0E=`W|t$7ZvyRYNAxf 
zy~9dh`R9^YEea!NU-N{6mpycb;8a*l2fj#p!9~ERp zyXI@}n?a$JgxJP9Sndg(rLhqCdnT0Qw3Lt*fa`|;)UIz~;z+i5|9Yvol=qh$Yw`Vo z1gQY43U^74)WeZ&P>31v>GyOJkMIb4!$XXLG zXrppU4_DAcxpu%UgEvacy2gmF=&MzS+orIzB|WKjjRO-s|Hwu>zSl_@Y_DpcXH4(N zOv!#T1)ezW4|sg#*(K+3Fj$)&0EwV=Rtprzd;WI5$>EEC$r+WTQk)1}<%j3ZISgTJ z%baH3+i?p<<5Y0NMREWutZI*cJ=!q00y?Cyw|gyofRuBi{mBj@pyh|&HZ|D6CSUAb zRyj!G0`rupH#b-Wt=WY-zItc&9PK?^)Plk(I5Q`P?mrj-?^V5bWw?sz74HrBf;mZl zC}DNFej{=7h5oaOJL;|PTm(^i@ycGKd9{;286N0BLiJ`Rb7h?AuyoXk@ML(;gu`6u z$T>YQ6SlB+u0;|g+E}%c*+?4GpUq%wus-`tp6j-Ul}VIZVn=B%sb-=_+Ra^=HlXB` z4y5*(A+u>Sh-VOecZOr6e#|YrH1F|GLDn#z(r1FTuv2fP%q;nShhALrf-8VJ;03B> zyTam=VTQdinFX2A4)`}GXn(NDkhkxbXr@Y=fuGrorlYA>i{$$w!@M3ahT05OeDpmw zR43n8p+SH3K;l(c9w=72+YA6170?`ac{EAfk)$|#?|lHxa!875S9?>mjI@kxfUw5e z3{<+(fSC@l%j>#~G=aL$O9Ao7-ARC{=$9qyCj=*kOpsq-w9i$=dWerV2z!RNG2*f4qw*Jr?Q@@H(uJI7`lfPzOxC98v!)4 zZj!=$;hAdoflb-2;bj&E^N5f3Qn}7lNE;YH!padIqXlvAtnj;$do3ElCd)-ti1v7? zn_TE<6`g8)MQg$O4pPVqWvLKivf7bk-o7vnxi-AN&bI*K49Ic-Wr2Im=~-Mv(9YB@ zHCe2Fv}e-~T;nz2&<8Yl(wj>^b;bG*u=jtxsbZ>t4?8$yag}p}Nymn%IJ#JcHsWi< z56leQUl20DOy-aqtYAh^NwN)~H_MJN$nD%Q_d zbig;hz-Vlf2ZeP%U+phV0D#~O7V*wUM3ioIva(!ZUES#;^byYyUq~${PPt-$#5?__ z{VI3M_^aM~vE$+cxx&W!7ZhQ0%(EnkIFeai-g_ToW=F7KFFV%%0YlC#%UuxIQvDxI z%|3F98$7w{kQ+LVSh8R$X3U+Bja83SqP<;T0RO}Wi5%dsM&xYS8oh}B7Q`6PNvWxAFrmwK5jryq%h>aQLp?D+C@nxN z0)Wa-SBroxYT(fX4&!?Wq_>;7vSJTBWY1;N2k4kxSlRh#EK(N6$8LP&!^28p_h^RZ zKUW)~=*7kTi@a(vrW~iCwCGAMwE&SBe)9r(6#y3{@Qt-lcwcxS_YgZ~O z8y_jPqzHVps;}q)C1HaNTFEyhJnlvYh?oihHL#+uJn4oH9?AEDOAXARtMCYC7=Tz` zzXT;o^*>oclVhOwzY&8KpI@oNs@UI|16BaI-9m@DKVQ!OQN?F4VRyM+LILN&I*}!y zmFE$$zYzegcm>gJy){H+L?1oN;>+&h=i~7UY|UgOhH6Xq4ge=RrzHvAZaI2W2H#p` zePN!zVl-7lzQJ|({HMlRB~t^)%wiO#--y8XkSbShHP#XQYOXiQ@FWMC-m3s-?AriK zU0A(MsHL-nBJ(ecB~^fQOe7BZ8u1eG^ZK=-^S@ki1CL~jBRN&ZbG7S6CjG>OJV-Le zEBccMo^fZ%-sCHE&l|xeysz%CWNFS=%Z3)E=5V^N&A`MX=j69OgVpE{IH(uH1B%`Y6;aRCF4SEP)|;QP3hk!-2MF|H??9{w<|k`1{)Q@fasQHuXKZqg zR~4x@MSNvDQz4be&l82$wG6vlD&jVdaKTY+setc*1c61!NiMcPgk4SBpB7xy`@|A# 
zq)!uB&j8#RE%8@F$7I8O)0>~q&zwF!TUYzrvX@kJcq+l`x%oh^v4_HHsG!eV`$NR1 zq=tEo_!{wx!R^T!2|MH~1zF1NATI(y1x#+u4GbKnBQd34#{dO;6>-&x=bK&<#~kS2 z2yn%h%x+4tMLNy%`WR$Jv1yCubZl=-;uS!ia&QHC>fQ#%yf8IF%*n(a9+M)l3{a$P zMN}IJiKRVku?`>-&E^&W&j1?zwSJRA({WeTQQIp~NRMr2>l9l1FBzSXat5(}zyWrf z2YBKN1ESl)Uc^0Dvp!f=C*3?XPX$TPk%HPQE~Vt0J|q;_Wc19;G9OHJEm#*Qo$9p@ z$k3pk?NYA1k)lGR%EEyMV*NSLz44>cVyUHm8~zSdU`w94!Q;tKU@g|SAywa{OP#9_?0j`kmg8pbXd-k&)3f{&mIvqT&5cM zgKu`c@%Ki-stB)?Nez2@^GLm2c~OTJn#x(>D{Hy4^r2@vrcLx#a|?lOl-9Q`jnc`a z__t4PhA}(b-=V{343&1q|E2q1vhymGoDFogmYFXKSPL`A04N<6cQjw6PP&aMx)O(z zwbPWq4{0$fI!T-LG03g^?u}`b>cVM(uh;c3eto~$i0V)q_0?DhWc9S2MNTdIYzV040m+0+fh8H##8PgtRxZ^8)^%SdLTbSjZ-! zfC>xL6JF}V6eIN%9JtMmPqCclA<)%Q*=~$ZSS-P0Gd5ZP;GZL|Xn9WD^Cv40ZWK|E zcIDD&)4==X=<7E|xhTi3I4zY?yQ3kIV(o*4Y34lUj$>Q07fPQ7=)eVr>=`y%Td#50 z?9!HB*^uz)GtdNc;2>2z^JL>f#x;wSnckzJdf8DlkWp2eI*g1hZP>`h`p0roR%M-w zCXkyM3SEdQ*qZ>awGyG~IEx5+iuf4>@`2ZUD|UyW_)KdH@uDu#XJ-C?nQgdG6$vaIY(Y`tnq8wbD@Yq^2&i_x?7 zW(8L;L+Np6o!D( zR~vR4g!GVEvbAi$pva|@SEUv_82eV#PsjdkTATK8`6C7hPfv-GIznMqqUow)2 z$%e%53<_;iC_187Dq!kGksRj@f3#3B#Zva6&jHf5MOIrvvTbZ&!Qt+#g(!>y7+JRS zdIK|n$49=P%FeflZxIvP{bI=x-PT`Sc;Bp_`p9wr1}{-s1~ny1Al=^H)RO~QLumrw z^dk=Rfx>%h`WbNsV&C^`3Z>(SitV{)Q$7)TFA?7Z2Bp(A+a z%uB!1;d-Se*^c2kyvUpN5~{CsVma0BvB+TLJPIlnBOWPCnw2AUzQwRb4IqOm_-I_f z-puX7)@|s}Gu{WV=eK0&0x{o6hl<38g>-u>Qp-^ETNAJ{<#2_4bNq9~c6B`DiE$tY zyQ#(L(NAV3YMJmp8KlyzeHRjey|Zd8m%&a`t=+M+2GDZk0S-)gCW?k%2tuw zf_$j7ma2nR81@@`yja4?8XV3xQeSIc*;s|+CV?JyXf8&(WQJq!fKQ^d!>>QEiVb$F z%r`7t9U^{QXJ931e~0TUDC4|9?Mb{+tQ}|=1^GrECsc7}o>`a!N&6TYWSHul{3YT` z#NSV4UKHG0GD}B1M{rI$dxhSM(E&NBcxpUK)4!XhVz$T~m1XruT-NAA+~_aRaYwG- zLD;-O@X9W38BoS*7a%4L!RYy`dz6sX+WW~*lq|#y$baS!doWmwb*G1?qB!(`8|of+doMpoJ!KCvy0X?9A8tJG&6Zv1fhv0gK^xPvJ>23EWMuMv|U_n8QScxES!Q%wak|6;alkaEd zRyv3P-(s~m)I8$`g2?PrHF&GR*IluY8f#A}gfZfocg6MK-Ha!yXe{n*j~5QC=Deb#-FHwTq$? 
zHx|DJI%HNdo|E0bj`$_wCkB-cn;^cR02QEw2TuInfb)FYX(#;5KfSPEN8o2|FBj2q z+AboGKxTf`Oo=$s|+;s&FLN^(4oD1v`>5>`jzaU{q&Ag&QAr z8g*1Jo+3JXmGj7->^2zy>93Ap;L$pmN+8YINRZoSrvAT21W9;pD5J3HLRY--ujel< z0?Txydd*>kW;$ZNhd-~Y@apy%DHjO)N?>%Lrq?C`q<~&IvKeB`<{taq@n68YvFhcNFiXe3xZ z|JQ>B)6ve>?Qgfpp6)0)>x8W@40Bq{=&K5||OIuZG&*fV}sFzLHwyb|Y72=~IqBX_swl@-zK< z{bxHl8a;Pq9$7^-bPt7?)- zDdL;8EM;uok>6@JYe4e}{~Ym=WVJHoUS$qvS~nqtVZZ~)!fV8z_>&oKotd!sar1>n zld)(4k(V)7E3-D>w}eHTIA^bj+$yrZ?__O_HDoM4r60hnwc#mMA6T9iPx`i|@n*u} z3BU6OX?Nt=lz&UhlHzzz-gQHzCx{SNI+s~lf5~rfiR>7r9R?Y#Fu2G`Up$78aM}pgFNe6!+v~Baofi0JkL=1Rjc zX8z&uUinxonAGrdt*ys31o_##?EAGR>zVrSgvUEjy-|vvf#Eh-s1XWgw6R-y&Wy73 zWevam{LYqhfk#jVbs=lHTfb08B8C~|9~$jd6a@E)y9M%b0YUGQNgsVjZ26s3Ij^Ec7*CnatsT^D*=C%*<7QJh1?s^N2)@;>5-%&23hZvj9QRA-AH)(zs4hYVQ1P(ljCPBi>9B#skl#mRVqe$$7BP2QPs zgC13L115_l7+L2#!o~c}PX?|#n@F?pftmgMgqO|nq&rINXDoVyKg(b{5AbI@-2FfS zLS`zOaB=@LN@;kIoxy*VH**|8%E4zGj2+QuvYAgULz&CpFdkv5(q3B75=naq$`5cI z3;P|cbXM6}(tRl;+>yuh9sp{bq!fOSoW&Ni{R~20oZp2Ev<180;de$$Y6}3xiPtYL zR2)hQa_TVpZE>)JDV|dVl3Qs~WST~6R1r@RUn%Qt^h1uO){!x`)=XI2P~VLr&`i<& z5$sSe)zlde*w|6Medhk&CwhPj6j2#nG`}Mix%{>kr2`6y($=3Ve|M4Kum9Pl!5#Ty zP9$$F>zE6L&xx3=uvA-Do2&v=S}?6PZ(b})30|q#ksx$kU`P+iuVh%oiPi3?fKJ?+ zQj8P!vZ!I~D9v7MAm_mA1$J2U^XBOlF0Clkyg@-^w)pBwF4hi2h(YEd1wm;$Csy1rPjY%}Fw(MG^gt9;1re?tlw>)}^-s$l@c~ucHsDwFg!U zYJ;2E#xTK}@2ui8>w2EiUhFyJqkmHgZexf^)svpno{`fY*Lk4qZ?aUXGi-Pu`ain! zUKD|ABA})x2Ik2YG#ef{Ul zKG4qbm8oiSnnxNfaz8yE7~1uMdhUVd^oD0_EhlT&K4btRTx>E&X`_iIDE(|R2IPji zq+(G8Q^c-xejDEp%Lc-rgDM0c=nqI)v zbxdTS8MIf;!}rUvOF6(RL*24aaZCoi1_}t%vW;}IuHa$Q{`G9qv_I2*ec*ukPoyhf zQZGJJs~f-|`=y#9?L?nUu6)mL?F>UyM4}gf%{KZOIVzJCo_id-SkN|kr*seiPN&#KpO%k&<+Q!;kP1qQin6Ihm4p z4#sICt**Qy$EmrRI$+dmQ@Hakb@vqksxjS=j^Rja&u7b@77nB|Vnd>WE5asqxeNYo zq_BgDsD-yXz^%^kakG`s()cEl@7eF3`6VJyPttN9zZ&} zk;w?$o+sLtQHI5UoYDJ#f>gV=szhao@7L?sNDZ!*ybdHpLFf)B^;Cs5AtBL5w5+*? 
z*HlE`?HWwrx^>Vho~SqPqkgZ&5=y?VcGocDt0@IbY_dS0s;hn@xbBVm@1enGI`d}~ zZRuwMF(_zXmqVp8i-!vePa=rHGAkVIW*N-GTyA1-Y`RC!M*WFf#tkJAD@c>>TlP#4nu9+0=Fxw=2d1cbn)ZjN8%$ z3nFF@XXez%!t82#!`^VFHNHfkqs1esn!C98h}(0SNlE)nPZ-PI3&2`JrBAiBOdXdc z#S}xy_|$Wze7yh+20mOe>ctiY9%jVn2#Fqmv1%c^ee;H+6ecn&))s)cGa7-yoc!Vd z9PJIVVk8Q(@C#u{iiR|W>I)t#yL zpA!meVroK$(+?}<7v^`b#FI4r$gI6+#L*Te2b*{F9ygjMaXoD&M;vfr)Lg}jW)92Y z9Fr{fh)VMacAMDL0ol=&$q*aHrwcws1b2J1zp9`uC$-Ju?MM(>GMkY%tQOpMTE>|$ zo|;Tme&37%5sC6>Qwef9D>iC?9U*cgj5g-31zM-2Uhvl&3mlddFb4v_ z)i8_%lD~e^8bHf>T<{x1#1o>P2gb}Sq_i3IPMFO3L_%?*AgAW)jUv#*0IZIzK{Qy# zVLURLKs(X8nzb7*;hzd!+O$6W_HI!kEK)=J%4sgAby`cN&h&P1d zhfNN;G|!;wL(ghu>_~|NTu>mAYV1&xU(kH~eW0rE%M4=SLj=Dc%}cg8-HS!F(aLD) zBS>lsJVxA8WfdYa3p++pDMAffWO$;A8r%SdlI(qB-s_c7^RJQ z9ZXttyZ#mZi^7ehMLADC8iJqJdl2l5hW8w~=IjzS0>3^5{9AH9q#jgzZfvH2vkiWw z;q4VeFL6oVsP9kfC6s<4<3_^f%CzQE?|%dSx8U$QJ6}BunGvBkM9{m4A5h8W5kKOI z9{AnThKhePI_uZ(@A2$`f4GNED%iQbkwU}z=0`xNc1UWmlJnf)8`=cMj3~tJQH{?{ z*B>K}3`vv>a{~i)W-}^)sC%~t0PHyn8wk{eTtX5r1b{seTJdBdLmU}o^NBj_7r@{> z#i)@@mBM$if1e{hkhq?B{)rg$%8k_P=%-%^OikgcVWQY*e_^9~HL`d$-A>&yU%$ur zYaq$uAc|@mfkAI*AA$mmzEH0-u+h{u4NusDm3P?B)fOdB0Guko($mhI<#*n)y=Uj^ zwTyT~dvmmjK;kYc%2Q8k=e>B640jbgLk?t|U`6Fv7vCI=Ji1!n@Ehmx%9xbhdZKWK z?(-2QUDBzXD~~9#w%JeRX0z+6aD6))GhMQJ4>o3I@d&M*43WV**y|GvqX@2r0g2TT zopoI5z+Ww=+IWfrdhC@fuClC2I#rcO)Ag$eqbiW?*n)=ggxHz1i)_SWkM_^%{gdx0IG^}@hVqx!%> zoVWVLTF|o1KAu@H^Gn2ETH@CMS|+PWtD`J^{?^pjS3rx(GoxkMv%+k7g*jK zhS(Y}oKhgF()85S-$()*8*O#9xwi`w65-MKL>xJe}I{_`TqJUJUlpdiex&lcIGy0708p{uSlnv2OuI4>N)cu&6){5{oMrH2)x zF>$1%!(RB{%-SDH3NIIY4gC=ORJ}Df-9HZpdsaP4iTToLxdn3fZKKb zUi1kO^_e3Y+>3Ib7mQ-se52W9;WCf-hF(AdTq*HpQs=J|B5Abu1yzI|#SIlSy~`n` z0CVp4jzFbSbh*SG+eJK)tCa!53pb)Qo;cErpC}+kX1)4g!Js7u9hQ{t;^s>m-ClAE zy%pD~iz#XRs^ z9qw^7;JsZ$4YKCF)lg&z&kP57Sf{@k(QifqYFcm1*A9gKC$M^2h7!B+*i-48&fbhHeS{2my~D-J=}Y-C zkVj<+2pKIae2n-E=MymAoee|pk^?9X{f)wBMwxX!Yu_TiM%>{#-!=n4YuWA@)%Jtc z1P62Gk$iukcHfyyx@ONap0QsF85RiWjj08yx@Sw}%K*yWKENtMmP@=C@Hczx_rzu` 
zOlagyC9W2lji)lBym0K&{hMoLjWH}8<}5`>tkb{5hm5SSB&A1-;SC(=wD!T3Rs2f0 zByVYb;TFn-%ghMwjfE~{{UwEsH!G;82s-2Yqa_Fv!t9!0S`$OZ#CuCYQQD-~!4@TB z@w)XZT?3!@#I&9YH=e=hTzU9wqJh=@=j`)fZ-tfHQ8s&8N1x1mo21tj4TlFx3M0Pk zC3z{G);~85_6`$xC5hjejo;SqCIc-AP%45>CZ_abcwPGt!H>zFr?ARe*ocQWdwb@u z7k4vu{X=ibf5BnAu490TIA?Qu`q4e8Wr7$^RWLK~|IA*W2xgU~rgIx#zUjbZ!xJWw zEfu48XrwOdVRYKVhKSkYgD#{@4Kd(qtsS>dv7AOZK$XlFd1cw-Jr->Opee`7fQ<9s z{((|>w<#gY%(KpG*#ZkE>m@5#n#tXmR0mIZzrx~%hkhaWR0T2EoZ+7J8#%NSi&t|0 z8@+6bK?fH6k>CAp<$ss+EF8#WA}uMO8f$E=tw(Q4B9X+Qt(~F)AzfqOE_=`ZOMC6s z6$SrK2r8cVYm;@GF21i0qk>gDGr{Cw^D)#^w*cYt$QOEIx z-kCj7t+j!1H~Z;$kMN1(si`j_F&djjlC5J`AevH_Wag24(RNxe&~dXXRyWoXlK7+VjQBqHRnO1ho$K zCTZ)8Qobc&8D*vezDA{w*&lxdR=Wf#{mfO8FA)V!r$x+rhhc|}5=e_+8B>TAaTO?9 z%bA7A77l1ci?X*-LkfC5MYO>rBc|lRTjTq^90k;8p_H@ z`9x$53Hm5y{c4iAvm(yO-FM8!@|@uIDXq?2nf!Xd>GV+F#af3OobuLO;vSCex`nN%XQNho3iKOO2q`pjLz!}I!D5j0Uw1LWv#j4!&uvYaPbx+6YjZU z#VYKjDy9+&^l0c<`9{l^x7)x40Bd7woRKXJQ2U65+eZ{wyX5Q&ywF$l1^4jmseHwg zXthkJ?lrIwM0rk=U<9Gy~~RMldW{ zp!u3VSc^fr1@o<*=!3W;3GD1gFUiW^C?I*}lHG63^^`OL%qJmLaw!W_OX}XT@EqVi zM%-@-3EGHn5r2-D7(A$YWy?0GiMiNB5QF7r7m}wSbG%xb)vgCZP8dy3lo3~!SwTv9 z>2)&;n=qpW()48BN39)IvUswA3*62b0`#N3M@4$f%qB+s`sr-~L2!)ZeL7|o14a}i3ow3wbBy~1h*80)6g=?Nc4u*M$(Ak< zfjI|bj|pz`4?+4i>(Ab7D3WlC3TM4dgcnof591j!AX^CHy&0AoNPx_zAbpP4H3Ia%LCce_lR#1Ie}IE zIbCdhI!?Psi5l_YEN}kI9xY(pWSuf$y7l=is3WzvR@qd>K9ZM4#B;-yb;nPsuna@pn5CX_PVdk+RWElC055}-e*4I&SwmNI1x_$Y&BtVs9o+z1?Jnrz`Ja-Y; z0iCe7@ZaN=uduC)1uW=v&_P!@T_L;RiJlYw#ebzTwZ=f^ogvF_I9nSBIHyq2F_v&o zW+EbF0;T=9ZfD8?VLj_7@*prhF|4T!wB>E<0h4%#$5$n!phB|2e+78yCu?4tO>aha zwaMrNR76kyX800>8Rm@i!rp78h3wY|cEaiT zg3A+V^9Fusmr!tFHLQv{ls~zxd9acF=J)5NdS>pKinx>U z6{DpB3l}1Kx@FW?Z6%#xcfwvtnr85l$nR)JlawVV`VVCJ8~$rDiRzUvd+zqp`moY6 zgv>j$ds8u^f_YL4<2yxQ&A(CpUD!5s#@25{&>s10Wih1)swer7K>Mp5_= z6*dSfx4Fgxv|Eh>>3s2Ws1B!bodrEnC6r1aO}}0uzVYgTAXDq9AWxQ=51V&>iTHEG zUqku^MA)7Xf@E4nxQ1i4Ox%1CPa20C+Bo_9YRp}bw 
ze@|3766zLpa8*1+fk}l_K}GcDgJaEtj;6l+z?nIM81=Us{(pl-z96KVs|<6wCnxWjtTMjI!b1ESOXGlyQvKn)q5tg*h`0yoeAovL#+Wz`IY7SB+WHWStSW_4AmP z!Pvfn_j-d8J3{@U3{cxN!l2CF86BtofPQ`dUc9GyR7A@R{t8g* zJc_zauRnrYcGkF&8Qw2hxzglBsSgssR`>1I>bNbb?AK%Sx&|?UM-K_mO(2Zp8y&kIiV_Do!R4f4}gr2r>ejYoYp;oH8ZAiq2e0JIvaSd zlHaY3k?G@|k2)}VTt&s7?+O3BrboC$Pc7 z+{n>7DFwwE>+M`PaFM(7_fK}P=XE<+p)BP<=U?h5wU^kZG@Oz|w(lW0$sTql;W*&5 zN=u9SLE}mOS$*;ukWQ0X$G(PnR`<&B!w|JdQBE zyYyH~NMwdmLDc4h2ZsOLk)OQTt8Z=>FEQBBM!G3t*1%5{7q?YdIx!W~5Uz}vkPK2P zsv<#<1Fk!$WOSxzsx0}JV@sD{Z$+De#*4{}x~02oLj|;t_%Y%qdLEU>oJ>Jq;{!Jw zSV!sP9;fNa(tN=?3^^4c^vt{Xv;=1-^Q03eZzBVlclh@aKX9v3v6Z%Fk|=CYxJQ$G z^j35^k^Yx9Sa5?vyh7AB*zagyTN1W#y_ig_&47MlV z)%Y95volO%Dk4!I#$YgLU>-qZ8d|Nz&Fp@?f!gFL1-hXECcY-GD9DR8Xsp1?W~@Y* zJhDTjn=Rq$uU{7bq5g*gPmo!A+ab|F$$$qqpB+Zi8CcBlXT@d~Jy~hBLzAvp)0^^0 zM9{*Gj2w40X*G ztVq1(K%5*lGk}g^9b?3oh<}H9e2q00=Tyt~PQg`}U15+v=g@{}B1YC?v44^e>Wc$=`A5COe&qi*NL#uTHou5xqppO zICt*_U{4d|Vh8fgC+OIW@b=N)hY?G)%)FdC4F0>l6If=SqSpdzo2=DN74+PogDTT$ z1${RAr-(B{k>r(wIWU=3Y;QhXmwID-#S;#EXIP;FRc>V9VL5J?Z*|p73l($kL3&XG zD7njqyR4{s_EzFKv#Kct5o}t;(5~F@o-%3`ac|!*EIYX4pecDDF@PFBy|Doqp6tY4 zzYy#?nvrY7K4~LKZ_V1BQH%#`;FalaxQ=tgmx!n3>!d3}Qg0)@FYDJ~g$00yELi;# zodJR)eMza7qQ|%!2>de!^Zwo1nj-SZmgdwh|X9qi&=^2)-kMORe)#O|DJP_k;-2XrZ_=uyKiHtk& z(3N?DSu%|^V{fFC2xD9weg+z!E@^~R^e<@@EAWb)4HOwT2pjg3)4Vo+-psKU)(L!g z?|mAX6;sZ($RY(N0Ofw&3A~H=G2#yq6~&m-a)Rg?i+DHvnwH8Rm9eDUY0VMbuOFee zO}>(KUO3Ky{paK=i#~!H@tV+eXI&q)`QY#tAp3LXwDiSVutDz`f8*qvk2KeVl23?=R-y zK=hFKCM|NYVpUoqI1+L6gt}JO4y3y~A7v-n2HZhq38blrcIP1e)5#T*C5A|M&N9HeZsUIeUW(qCrq@t0LPy|rza~KsTo^`D!Km>UvXic z;E769i(R-lKM4Re?{k!9OR?rR)C}(|$2NsKo7zeV?#NRpkDMXZ68oP>w$4b(WnoAB zN=sV)UjU2A*Z&BGB&p4n$5ADyTMv=&pLmS;yS=idLRh}%b8`QUd=IXw{lE`p(CDL6%W;;Mk@g1Dx$Eptu5=kA#ZK$ zZ3o*FTuHuHS{K}f|8%2#uGXep1R-S6811Rrui76PL-6+{7>yhINyAc$wTlb6LTzqS zSRF1V%ziKI%L8a~SW{?X07SKF71ao~=PdVGE5dBhcu|52@$Gq#9qPP(T|jXkHju#| zP13C%%aSM0SxCi)8fza&_Ktw59jat`nq>c_j!@lYZ{r_!9LjACbntsiC1OH_Ci_vB zgm)w+x6nj!Wr-Eqb@0!Yc<>j?cF%c1jA*yi6Cul5I!g1Nv%XQSa8j5JSy 
zr2?2uyr9CDxXl2Zea;YcSa7*n7t51D;lYmXw0VGQmT0D4JXtT=#2p1OAo=nRZPCeO zSe*%$b})1Yz`?No{@&=q0{H1T%$Iq?fO-avEe8CJSe^W@^t!!%n` zk!*0d?+}>c86=hJtPJoH@lU1;pEA(Pva&3mf@q|b_&D&oJO zrF+C?UdgF$Ow8>zDNyZVXA_|$>S;)5CQ|vlh+2~GKC(g`PRB(YEhd;dAQj+_fK7d; z0FpddJ#OB-`5OX*(lU)3w{U<+xveM4&vptM+NpY6x^$N}0Qp09enju88eEWg{(wtq z2q3TP8jKF_0EX!}%hDwDCavl+4BO)>AHc8$uDeJ_NsN}O-ai>HvH+-0)^c(~5}xyi zi3;dP#7}!|j!S#4*u%Wp%0JN|6OlYUJg2r_2Y-dOK2W$w)g6lXL;wLAXkR zOV2k)qqKJ{Pcn_6fUp{Epr!hHc4R}6841cK2VgR}UGUg~LRz!S`h_T3f-uigaLs+% z=AkfX{%3bU*N8tO{@Y#`S(*N!gP_QqkZ-2Jow>5qHPX0DZhUTn->Dm15S~0!dZ|qvcUj9&JA2`L0|{}% z76+q)8_0qC{R@)}D=ppxJF2YjJ>rk`EHc%>-_4N-H$}ux`h1UoTYWI7@KO_2xM|=IR+Sso6hFnG%);IW%hybs& z0I!1NX$LINdnrZ?++X@Lp_mn>+Dyp*>G~4nPJj=61OhKQwh(Fg%`0`8Fh(EwB zH3Hg23s)MQyHUZH3%GJNTfFyT1y9zzcpyK@Vc>+39$12oS0ss4I#EL!KJwvLF1~zD zHpI7q)Y=fCC<&gdeW3u*Ux=ss^&90xa$QKmcMPDLjZLYon3rcT1xvLv-E3*NWiWz0 z`P|*0wy5KUsG?9x8Xy-L;eSW`bNyPA#j`WPJ98qbp^8PiIGBz59r-5&ytfOLX|d zx+XfP-ov_x8FmX-0@Vvg?_flt)Xcerr)F5@1LebtTEk=zt+jmjzWFzs-5@TZu+c^h z&$ZnHJf)L!l{KBQC9Pk@^%4Ly!B(AYTJnfLQQ|XL4{`GR4cJYEuI2Fkim3(apbi_) zKT>&0!lrJV!ED?SaAE?1yXhf04yKlC?7lWAB$c_t8i#tT2hG%eubbXCNY*})-OYrX zikPn+IL5Es@Chz8tSw?DwjS01t}w@N!fo8FHlP&e&0T2H2k>NrNILX01$M8@ zG1oEc&N@8)fzvaAgU>WjE`IJMbK^A{*bEw4xFcodH#ReONsuv!x97DaUPbZc!kwG5V!X0Vfg+O%bq_Vq}dts zqMnL!9X$jtvMBumZFS}h9yTRFB5Wp;`z<%QtofsHY_aw}1xxs7f`G|p>rKGY$~3o0 z27HM4N`0#&5>D1Rxidhx4I&GJ(qqK4%|1|yF>OvFENoK79p-?wu*rt6$?;jrEnGT8tii>(5UvOp70uiC|5zr zKQsmzO6tRBa9C@hHvdSnKJxP??!2*O+R$M3Bo3dg+|%+>_#2xtZ^Fld3TA`H>Z$3= zOKHuc0_!h-~b(W)K2=7uNY#F<)k|rOg}J3U2$E5<^Yg91V3JEkfq&ClLR_JnK+x38-IEhO<-$Kbu8Y`_BBILhrcg^~=Xt~XFD%U8BTguK-T^*mKwxRq!iEx7#Q{$< zWVEV*6l*ss4zV&axdc;Pa4F5^EM)gV7Z&nI#LZrc6bvapaC+u!{Tr*djpQiDh_9fQ zk~wLAW_5H&FRhKwka>4J|Fp(V{;AA+by$vcA+ya0Dl+e{F231b`gpJrCkMb?MM-GR zt}K1%nT8T+_f@++TCSPLP?bRnGB{zC9xm!iRPHxdVK(hSWwdFSa;e{p{_;B`equ0x z_)El-6_AeW02mg7cDU;UrDT<#ODkqwvj8BZI~txW?fuZTQSIHD1uczeKilAmAT}+T z_0M$|Y?wVvcE}eJ#m%=lMTuB--Jhe%M5r6Z1)T7~nRAR+ytZr1m*kHE+jl 
z&e=D2EK-;zy=N4_8Hx3{sRM9yUsxBISZT>on$74HrZwK#t87|+RQm8Y24!5=>)+y{ zCX2y6uIEUREAN6T=s7zvHB!AdXUSVK zX1_IZtP)rrHj5`YypmW}9Qzj%!U2M3Z;G7l8o;9+l|5RXcWzUD>9*d*0@qvPy`K=F z)U5d#O-5u%_&|Idfg370ch&?@kjiY2;yBnG+A0NOBfz*e7@O?B!eV6uWlrQl2hfjt z(USAG5WD{~;+KfOv!fm5_a|V06byF&k{F{(rX?dEMvE7<6?ca<2UXxAD?3Rtj0tBl zk;sgGS&5}blRc>2aK!ICTB4C3fCdVx#^~O$#8<-8WHq^5Mm5sER%2Y9jYVio(e%TH zbKaNj-6o%m5nrqcU57~(ujoF4(vpk!{_qtE>&@QF{*K=}GZwMg6tbwm zT62K)6g8b^1si@_Q#ksx;2q(Z3QI0iyUP*qvKjBZlJt@9j}d=F{0j%znuID*0jUq- zf&-L4Z%M)b74dt-fyK)yl%;|6Tf}cB0qOB14d+hX;u|cI${hu_pbFxRC73>Xe$mAN zdMYh+5nG7Iq7J%bkfSBtK9Ah-tl z0Xp%v34lC?ij@3rjc5nXe_jMGSi+hww^(au&3heJPK0bTE@&deRn~U6EaXAo(xN zrK{@L(Zl;cjjc^oD4oq279%I?>!A^6l7M=B3(^mr>u^KoJX)IQ=D{%~7JwpigO$Y3QDHNfo( zfB8A9fvIgw#iSa)TY+aBY2{MBb3`u z&wuC;V$If+cVs(T@)6016n;nYW#!ie{LSs(tp5Qb(xY@?jULT900_Dk9GJq0 zi+7_&o=w|)|7{T;2WEQSn0Zz+WBQCf;b~m}3BZ9`C>6{q1Y)0z6UbFa3xNC+9fJIi zj7Nz>VP`Q}!ehk$i1-ckrmV9!UD0GA{jbm64TxNuY?$~+kI*C9sEal0BnVQ$vaoqh zC4Wy65Ow6-1ArjI&Hlq|Kes2B4nXjR0R`vl26k?V{SAK}_Oj*#1eJywN6t`zClS8r zjYHnxa7$40110O(3J9fD2{&YLi`Mg>5hZ1|lNoBsetwSVdAoO+S-;FU$Q?T7Fe)Us zB>M@D@5w6)4!Rh@0xQssY!orW^0C5Q`5IMTONA&wkm`iJ8S0mvm)OWiZLMAhU|n;9 zi2D}t9Px#{IMUS|AJs4<1vpy_mUSJBL%(Hts3e}sI@Hc}g)@2LfK}^O1nF!>UOKts zFL@hlw@IEXpMmu|n7iw(T&N(W)0xx|ucT(GI~K-VHQf0T<4-J4s+9*0Ob>(c!jQr+ zj%Y)N(0~$8*7O`4p_ZnsvrXMfZed9r{>3=wow3LrJNFZgJnsWpv{}EQV$;df|L%=r zP59fp3aB>H?S`YeT7p<&=X17)1#Ur!)xf#i8;%Y9ovsY4DbHVP7G@yg>;`YX~M-=sKSEAV5Q{z;_oJAEb$m#%ozVo9j!D4enE(J zA3EAw%M%;+05GQ-?PUueASMe_Atlx~a?lGaBF$D<=z*Ub{DXdFNkp^8gNkQXbHMM6 z#!e{2HsVXfuUMWYVC?v2u;%4D013pEM%z7@wao@mw-H>0i|x}>GQr!|IJSEI@N%G) zIr_?=ON}i38!yYNH&<~WQ)}&ow2|sG<0A@AMXSju1gYz48^<+e#y=Y2d1L*Lu<|v5 zIV{5Ey>J6%M%{2lJSc1z=gf6${gkhtu)lYT;8RMfy{`d;)N&V|^F88)OG&rduS17l zUnfUw_Y7j$vlnk@*z=F{lMPy;cU<=an^!;~J+3>c4`{1;LyQu#^+u{4KN_brku}WZ zcxpTd%wfpp*xZe|LW)1}W8!J{04l-v@}s9;p7Nc|{9>J`J(%DP2XMEZ$tr~d7hkF# z;ze4jWRIH%SYf5)K9Beyi5Y|T+Mih=f3`A0Fx|PKo$AD@I*pqzL4|+2*<7Ly!mKs* z;qVa%s!>b48tUEIK*#YTAEbq6o4fiPHD47yWTJ4*_5zQsJrdyIgqjSR+N 
zfi#2_Ug@3xx22$b4J{25+ z2A9@l(N}A;Cz4BYu;;N0Dpc3Z{zU=5(BL2zxwH2q6gFRXBl-z$x|n(7K}ORBbu9V4 z1|ec?In|H6$znAAM3Ah?Y8XPl>Yz}o;etwDITs(Q_;e#moI4T{Rs9zs+4CAeNx~$d z=i<rsj<1{r85!a}mPC2z+g1$x(mhar*5>urM57V8AusXsJMqbc&VP0T9(QV~jsd z_^iI7ktlu5HegZU|L=%n#ILwcUAZ<8HgCqhNNI5K@mIpY8Ex0Rb|7;~{xm>l@#zKQ zID%KFT(bid+}N{?0$A312a8dGat@mr%xp5Ys}*1tD04{1nKYRhmF(6I>@aZ9wJC+Y zrYzy$OA0MYOna`Qx zEKsH(9NSYdXgDGBW+TtcD2VUS-B%0O7wcu4VIl8ahL-dSzdYF}g0od%Yf~jAGFHX+ z(%!4q+kl*3dC_dYsUn-LzqRIeW)4MT(A8fA7%?#$9v-mUO_lTca2Hnv{ zeS!+u0Xd&cIk?fmklDX-1QjrDB)6)E4V%4i6aQi98GPnV9y=p)Z<|GMU3@f}U?Gfm z^r#;2>h(wdN3LSD-EP_Qo-Wv#ol83K-4U{e4NMUzY%_wY4%4jF_jv&-RCad_D;i#G={j{#4i!@ddvV%|Fn}WUi zsFgob9j~J9$ge+ejRlH-$)x-RXZ?!cl2 zKp!!o8_nQvZvSPf^%F@o7`QBrDJ`;57NeG-2v1x0WxyQy`snbze}$mXzC)NF%03dpyN$++dl$R{#(G8%W4fMfUCnuZj zFV*?9$)N|R;ZwwkB;)~DTG}}0;olT6mZ#bgD{uDdNePK6!09!Mgd@ON-RMHP4gWHN z1IFwC$++KGmbTBAs|E{?FE(tXpf`I7rXA*RSStt$ z18`^TY+x4D48z?Rbl5n;3YurM3k|?R>GK))z|0*#((*ZJpRnFX9iC=hGs{lJzen;qR?Cl{Q z*fl7k(3Ai5k+`q2H^9hyNSL%ig9*W$*)HkX2FR>EcXFdDkQ|@}`M0WGd#mE#%r@RW ziUEb;U1_KWj&Vaxvo@*X{hNB85vpk5FsYwzwuaRW8wzVgdF1H3^zMQI9y0zTIHf-# zZb<2sNKWVnzN0ai*RyD5gZ48qT;3s9L50$`Ps%2c9e^1AWsk*ced?$7oD%456>?7PeUtzbCA!m?jm{6!8gB%guO&{8KSapt9eR-?_pL zzC`?Ct(AAzmC0*roT*g3i@|-N89woWi|hJ3jciwbXwRNY8g~AO2pXvV)VQ(H6jdWO zr&ge(hIEMdl>+8BlEA%Xw#yMdM|_X?C$DR|XKcjS4@P}&*I`OL_8Zf;dx((A z9yl*z3%-6M>WB4Us-rC|jvg~6aGF)ALT+ks|uuO12%j%C-zl zC4zxme4?$PHe^_Aa;PPOkCufv+5osWtIQogQqd^Ne}Y4G7Gv4=JtmN|NXo)T;)66J zQZyV{jsXvsoPn#Q_oHRxUb|NXcB2vO#%qZsxRJF=yYNo6gQ_+tNTo@mg&P%CT3pz5bCiR+ zZb~Dp70(-%ZD$ndb{Rm`r{(8fD9avF=aWA|fnzP8ObQldN8PE#M^71mDGV{F)orpu ztr}Y7X6}do9sty{uwd-pcok=nCf~8{L)JZHiqFY3n3DnjUWZq#ToR9 z2u`d2W?H#r4NhjOO8z(QKcK7krYGAoUg~1aW4ml1rkAh6I$nDsR)vMlXq|`k^(!KM z2koG3yjgGPusIG%S!7{U(;{G7PHbaqtU^`r2!}$Ry$n&4T&No2Z zgbiv_ip+tz!l}SEPUHcLBBDe5wb0OiWM{ZXbMZu=sH|_e0Z>WmxM4hEn|@ldQuFJ| zFXP{6c=zUXdV@YCinxh#X%d6PS(Gf8(BW)5+9LwrmS7E5;u!7lO=-%b2Cv&X1`ZgB zbkl>q7_q>UH){tRR{E&9;b&}iF@zs5ik|4-?WZAJlsI#ZBsj1&72!Q;WRnW%CqlB% z9M6&^SmOC-c|UbFKTke}+mtI<#G 
zfCNGASgc5$0oaYW!xj=1yqG6KUz)gqQT|8B^`) zoftS5!X5QmLbU0FIz-Q{HAT&U+xK-VO2H1#unmK&2lCZ>CfgQP zNh>y0SUN=|*|e&*;x!A?hHfaS2lAbxmHa18?i;b;YsBYG8SwHE^R#C#-q989$!uTT zdC3GPl8pm5Sw?(Cw|V7UA5$Xz{_v77O#D+D5*F^2rBwdq<@x_4WssEQ9s4g$#h9S) zdj_C7!Xs&`N~Wc?8I1(daqR;e>vu2G$*UUVF_CV({cRamX-#_~kV`z)4l(M>M@rU~ae1oU9@zQ@ zXEIs_u^|I|WBQXiPnM#|9}!Ox|ABkk@E#?nw8l^-V*MU-TcXO#dh?dgXbO!b?!6GS zZm9(56PL`7evewEghi+#eg!giX(b^K<0TryLe-2nF~lrUP^__+8Flj0MHplF8+34!FnEAko zpKno(-y*)EtE!*_CCKQq^FNqus=#Jb<$R7aT=2&=I3|dwC(MMKw$-G@rsHNc@1yiU#@tB#zbws zv0K{|dcpj&k(}sv<1AY%58M?v_W+(xc_X!dm9!vdqL(4L107Q*JxDKfTe zDf!}7)kb>)GO22@;{wM`4m!BcpQf97jQAyeiO3BfdQTx2ZO3=g2bcN`rd&~BVqAy+ zFnv~OjWD8?q`qo5Wt94r!aCY!5bVw%+O!@;AOo1uFN`*Dr(owKW8D@qscl*msIQLP=%Sl@h28DY$Mwq*v zs)mBjv~B_kF0!+Z)xnIaggk06s%Nib&Nj$qdA65`Z`3Vz{O*WdUe^-Cq&rz2~>QdN3l!Dminoh?Cu|F{8N57xfX+uwBWaHqf`qKSCo;$y@gOs+ms zm6HrP+5FlMz|k6!3MN{-;}~j?#rtL>jf%)G5e2oYmdr`o#2I=l5i&HUxay5h?YO(y zs_4Z>Dx4{;+V$3kTq95HFqJ$76ArDdUi1$9tF4v#Cpb8H{>s$VPI8EhoJ|v-e?()R zxla)(VwGo(=fhfsQNvkO)~>ya_#Yg}FYq7zW({HHwrf(BigTl5G;@+SI9yq#^2npj z3@Y*AuQuy6Ilu$BxS|LuB1h_=((4mJ*PUP7@;5LO)S<3Z|3i1guxv(NblL4^O zD2Nfsg&>V53)f8{o*0I+q1)NOvNBj{o9Fe-F9+iW0zN?pPnLd&MM@5@Adcd0OA||j zOAmX)Y%T;-?N>H;<_HYP*k2fpy-nUO2lK0SP!NTo#GO(P7U z2=he7RZw!h*_*>N)%{=tHa7M^>81N8iD&k_mf>sxAh*7b&Xpc9XbTjSDs*9CE?E{n z+4MLsdJHB&Q~_dJ#SsP&N4!eFh%FxPfnJ4E7Dfs~@MrS120iJ;NP&L+IxroG786sk zZ{|hvg;7JXdGmpbyplETte{<)PVpyK&9m?k9NO?g7(@z;r-)yTR~$`Yn?T3p zXG-f_DoM(F=sne?^PiE(qakg3PfI>C&v`*lH+TSiWC*Y}xh ztEDxfvPB!#AW#yTw}{^(egQ}9aiFhI3zbpzxsr3lOYO{(mBN>R!}CWf9xZsE3Wp~v zr4P2u8#2@jNOPBZ43V~#iB0aeP~alu_e&1LY-D(6ii$jiFF=j0Nz!gMn@akPc2mS3 zX_P!M0{p_FyyHwtd#Tj@;yd8qpD-Mc)*HGr8R#2>{bT(IYJ2Q>Cyp@vi9ftp9rzAl zguwnIV$VfNV#opGdAC8GiT5}&CtUQPBwPx;TFcv-16K^3@OV2=uQd4s1?A&r09Pwd z4;Yh-Sx3G9r~uS+q#y8)7qinFHX}M{g94QmwZIr1&ECd=J@D6(C6qFHwqPdx&8t!G zZ<~EAIz_#85Huz_txOo-d=!A+ZQhB9*d1H5gVVUJ3jlrkr!5bE=8jq$gV1D2pfGj8 zo;`8rdybma=~f_6K8rfm2C1U&C!c35{OlZd|!k30z-S}ohqWf z8!Lp*mZohq@_3Zk8{Tfru~Zh@l22{=fzqHr5XScQENwOpR}NHMy#R>@OZ65Nx^)U& 
zFlGL;6)wdBwfu?_nxvU?=7{`Y*-MjBhc!rv`b(5-UPS6KW+^%*BagIG8=o3;t1Nl< z4SL6W=0#i;H!kq5K?V^(uF_&lPC*I&)HW*TgrhF_3xl1$ZSPYLCT1u`DyJ) ziHKt+<=UBxQQ|-2NncG5om_l}6&gFICaw`Tc9Ll#7F|# z$`UIj6usq4J4=W1{Hf;kvwuf|(!qR4c~Zb6?=k~Yjkq?L zSprjAd;TSt`?SfSmyAPE9d_c+D}YmHd_m^{z%6SD`5maEv(Ea;1m#I5XaF3~=1glA zf1(mFt;<)wga`hSRK_LS@32_AwE+hL`QLaKo$|E_d1u8lSGGF$(?62r_xyH&EjOIr z7CqxO0-f3Ini|#s{ClS}$KHC7Xd)wiIU@>Tzw+|JQzWt zl3ws81rgaf3@M-xpK&;0{>#^8#1ZN-?+-tkt9u7S40e#toc{#Ief{O67Q&#;mF*{cD;ZL`EoS`9j2k2Xb5=xs1x-By&x&Fa*xJnlRj3TI-PsB zw@z9KLnX-Ft!Yfrl?706A?=@d5Yb^f?muS%M)MV6$+wr1uQ=(1#n9ppY3vUQ2p4+V zr68{}1T@Fcm&ASok2R&EJZc-s zCda;A!{{MATDEP*qLn!JIEVzq%KJV7-&K-$f-~xQ=5if9gUH&#`6}NSxn^Pf+eQ&P z>?a%c`LlISmKNSG84EN+n;lr;D@!QBFfVICL%VU=g#ya@;Pu_4k()_=~-6Gw7yxh0+ez z?=5Ramxgd)^ZsCr+9TA&g}4`M{?HFlK^`%XvWQx02_2&a%={TiMo{X;efcfH^F6c^PMT6mFyI0OLj||ztKc6v+M<&!;QE89IT!r3NSyq zQhT*H%9a^;ih=4<_?o>^Va9t1EH_j+dP2&@M8+Ed$bEo%$vX6`sydgHu{LD1?q7EN zp*H-R_2;XdELnwRhn#u)#CAwFHf)OUb$#WlR|CsE!on{E!(E2^0`opb{D%ER%DGN< zFV#oUCy=l-)cPg{G%b7a8GZal8K$u@n_|LuHjQ?oCHd2OoYn}p^=|vDBKE|q4$4gp ztd77-#BoK$ zUzw4r%620ydu2z9P3Ur8!+{^umWqaM40ZOIibTJBBV4qAfB{Q7r-i?1^8hKsxw275 z;+=?y;iEd<#6V49oHFe_sO)$qMO-Nqt`UDm1iRdz;@EQ-PEvtsRHF-du1WEkRa7u1g{yOu5wKn2Y#IJM<7rtgE>k#}$#8J%Rhb(~nLLJ03^+NOOhL5IOaN@HP@Ws_!0qj@sLtL^IdQ1Dx+O0HFYxn=ao`=AwZ&t9;!thef5C1&n6J@ppE-6D-Rc`f>>8_7 zL4CA>W(6c@*8oI0cCdJ_6!4y`g5MfWN;;uc^x4^6Op@OY?XL!|# z9qBmEZ|OT*|N2^GP1_AvyW!-8x#PzEBr)F6Xe-6=i#7g){e}DI9ss-tSYH}AiIRxd zOi0U9um+)3Z~sm4HXt(gn9+s=;d{g$>0iTd)#jc8aDH1KfTct0e@FZp@s;=S&1oC* zt995W4y5Ji?hMEG5kJi^7c+Q;9mx^I^3mXjAohNz4i(6H$4v@Ll|GeeYK%#mht|~O z8>YA1(mQzVWROic#f%#NutreoLK|r++nITmNMe7Fcw5(iN|>W7h5rSUMZ_l-tl?}Y z9QeeWEIU&PUP0UVe4x2QC!tuc-;Rg>LBu$5E|kdMY?Q%9 z5a;VZq2gJpxg5xkS3&-)|Dtv~QX1E-*Z zKf0L^F+abBWp@kV{F7i%U>zkpvJrg?z{xL0Ik;9>EiPBPyJY_)HVK3O@&Z?rcx?0$ zsYhyw+fT&sf*UC;8_CQ6e|>ax)Ng%YZ997{@N(Zd2RN8KL3(jLz#e&uQ270koMqCi%?xu$PEK#8+-XbpNvA-xwv)ug}nefd0n6G@N+v!jK3!SD|#g4ds*d zV|GwQ5@-y@?dAbQQvRK3aHXqKM@nfF*4+Vyd4`VOgPB#>qk-!=l1;Y$J#x9vcC{r? 
zJ(vMY$GxF~Xsz35#*Hi>s%NZ2X*Jp8;V)?HJ1*iy%o30gsM&w6dwC*l2EK9z4whLI zA5mD#{eO=5l_P#*&j+0GAEs{GE+UrJa?qf+FGvn&{$_1@wfpsVbdlBe>Ify2GOCC5 zEb<^kz0$UMg-EJQQW~bGJgw)U31yW)mzn##0T1>(%Y%PkIik+YC)TM0^oboT|IItS zubf{&!V><{Wo|u;>V5IttEh%{rj1_$<{l9KmVI9o%Hrg{+gM>Ww=DJLpKA@R{6~wE zdSd50(n1vwCw3~)BC0YVIx|fEQUXkM&7_7jTjC++R*dd}qakV22NaODVR2@osqrO5 zDanF!3dRkvCd^z7^*muuQu!v0Jj%i}FhorD+2k}mWXrY;?o-4+Y#`~)4Ette;+WB> zGfMealmj|Fd!s65{Gs&W$^G}d{9E?#M|HCEG42S*rQL``wYVT&Xr90zav^2n0tyj z`l0;r>0gxEdnvUvFj#*yPk%P?PJx&HV&+)bjPUa&i!$Rpe%hg&X$z6@y{&cl4LofC zM#AJg1sUj*sgvf-`ya_h8)TD;zdQQAWF$yo=uZl_mxvz` z&&>6|am`DHVQPPX-pWS}u4}M1l@cGEAKH@qcOrkk@k2-2Vbl*eXA3Z+fA~QPT#)d; zTOF>p{C(p9R1wFBFUI?ctL{?-U9BHsx%qF5f~d%8O2D0ZObW~>v4IAjp#iLP6r@gu z#_P2J)IZ8sDtP~QoXKF6XkN${NuwK?!0Rp_ynY`;d+~zhOp|H19ERg+=h?j`}w6`ZJ zj2_nTe*}KNam|zQCqX++C2M@r_|>Ur53pNoy+>c-YL~gQKF~QnENedz~zgefD{Z z%0Tv3sH_co&87qm=C0m+{WAbPk-JT1$c5p?`^_gRASE9M(BxzrBR})LJ_8jqDre}n zGveqYb-XvDwE8TGSQyhkBTf-t$TK^3^4Ykt4?n(W9aKYa9;@U^+tK4-l3`Wd&ZN^j zjPAzBl|H+;tWPZUQdoH8=Sm#I{1JTUVQEW{<_~+Pr{8~4hnwvHm9S_No^+YL@n9^# zus%S+1Iv3Ac+wXf>mzAFx497`KA|=WtTG%s8;&nB3=rfslMRGH2hsolfB;EEK~(Tk z&M#$mi-*ot5QWJl8!Yt2W&)I(aeKY~lac172_jS+oZ;A#ACmg^W<=&V=;j{LWOs zuSRe?mtE%-cR)sSOc?xc15Z-CesB3eP3ZU|5?;pL^yMQ)*0In<;;sgMgKxe;R9xbYD z2!VP|>yafKiCzZ|SbRi2jxY@e{KS(Tk_s@?^OmU%`|0WM+Hsv86H+!682=?h@Dfd{ z61y`CyYX{3D>WrG*onOUh}O5TciT<`fQwDC{OgkhfC2;T4A80Yc!fgPtxI823miDT z2MR(X8D+(iTrw5Nj6r-d8QYcNtARa@Sbz1A1A2-0lYxj~H(By@J0#+{51nmnNy}mE zK^jN1M?d}d4bb?914v_LE#9!ohGV$lzyk;P2{-U&QB`t|*+4-4DcQM;y^`Uvqw z@eqR@p$!LA5ufMhlnqBwJM9=zW66RlWg0&23r24TnniNM%i8> zevSA~TgZ`fwj&SU08K;e1aq)@_?Cso=%R2nMjHkFJe22)}Vft}Cf zPVeNzbEaY{c+JSkeErh5djc*heXXn@o_s1sA)G)TGsDkIyW!bTa`IVNrKMAg1ms~&!0OY1nhzX_v;F%6*T^Y| zl1Xfj5v|oYcJ>}sktfo)9~ z$!|e)t}J=3Y{aJ!+(rQFj0HH_^WRMWGk;_N!gV$8vuq#$-=G9*Jb7|`FEbv&TN)@J z0c>F;_qO*5s~S%cU#z3&iNGqL5*Y?qI&n{U?;_&a!1|8RHR1zv4A7(bQ40Jc;#)*# z$<1sedHzum_i86uDxjYGZ#EkVr9oa$|0mp72a9pR0o0HhiHve0Q13`6&*TJ3p9_mq z+Z3o&7d1Y3GW~~?)NffcZh^r)M?7KqQky9ZpZN3FH49Q9WKqhYC^=$0Dq?O3iL%nN 
z18=Rr@|$rp1Fh-2Dhz!DHuuVaFD21A|0b!I)Q)Nvm<@f-nFLTvxaax@yy~7&%I~Jm z>uoa5Mjg4a!56QPI)MOpTEEUvkFc1)pAl2US0jm3+lh!L?j>2ayA5CQyA#Jix)3Fg zwP}u$1z7ZDpFz4;YvCC%i3*-Aq45{G>N8$a7)ctynn$nsuel1PuttoQ!s{Cxpu&Z` z@*xWV`-soH@v_GOPW;yb;9uEW5t0}r2Rd3oP^{!5yEkD>8x`31h<`@>j`U6NyGD}O z0c&FOrCPnoYxaPJD+ACABUU$rBoz-CxSLlVL{%X{nP1nhgbb^}6gZvdh=M;+mF)mB z$*sS{FoHbbYBch|sVK|`B+q}1_*X;`@fAC%td6S^_cnHu8&IED%k!U6rFOJ;DlNGi zUy}|&m8qPpD$ub^hZ#qE=TAb7o%>Vx55Htm`p}W&dS=kA}eDRmVt?|+nbsrFh`AAIBsHqcV-bC ziC86wQ;n?85q~pHh4Zo!ko2`YV$@CKU2dJ=~qjioPnEY{L$e6mho6RDc%3nuX z7%Du$JJ|EEZsU^;FCeDG^j~?I+C%ytnGFIpj4oRa>T4>R>If!n#E&e63~uRQ?sBB$ zS6U50I^=GB00x-JiJbceowbCA1}>*vbASb+(ZPWK#Ew4jO<}JpbKr$_sFgOlyhx=L z(?>GGuvR5!k0lT6IhZX_dd!xfL}@*8R{ubf*V-)NqxqSf#pxsd1d8=Ix@Tf&q%EP_ z8Q9f&wnSlGaE$77bDHR{o&Or~3zpMEXO>ot>+qqM^*d=)cyGyfo~(gZW!d7RH^NV7 z#;%Yw75|LDkR1koTmz`V5SvJn#uWTBX<0{jOgL<%wo+X0`Q4kfZ7rVT+nPo%Y$Se# znxL`1!qQ%67VB|FDiK!ry2^a@;!&Jfw2X0CoiG?lJxZVN@xm0E7 z==#wS?x=R<6j)`{MrY$wDx=*b4azC_8B8829ua&rDMZ^s&qB2v?<|AiVT zmmEL|*GF9683NziX=>|~*#N;({E5RZto2jwb^6GM&t}Xsc6Ow?Z6_?&kDyFVny=Z0xlJJBanlDYZmRYp4UYNnPOlQ34HKVQ6MwrCFCK+-dCqhZI*A1e(bT z7;og{8x%%Zb-1;GTVVqPCd)>WUb5NjcV@Lqu6!}t>|z{i?m4t!S9VL;oCL@n;dvv2 zRUP(#bi3!y6*T2PcA5VicxH(t!`=ox;av)%a+me-2!QYK`^#B8Q?Wakk4~A;qgy=bq00yzYzOZY_T35N_p_ts)r;`)`2@5u?a;eVJ=MLcdWcLDKn6$TBn~ACtSX858KwGdUz(eJZD~PHVk^P6WMWQRle`6JdrHE zR9Kzq7<$6!uQtlL%F_RP4M3HG0}g0rGbfWWzp);pM0j+Oa72cs03|nC;Q&>U&k5Md zUZ=c#t^kh$i2X(+m^WplSCi+A;FTR0m{_-I(|f8zO~6+=+5%7TZhflx>3Oi>t!m*I zts!3S>50uuHbDLA0~Yf=;(tZ_N5ub4gjfR5Z|hk)GYII7-x@Kg1M#e*yJ?rrk_nbq z3ISIh%tq<&t&X06u*&*9Zr00p;}*U|yzz6F%^F6ByhL({$=-f9m=t?k!m?x8(7)nCIX!z-Xi`x;veJzr6p?}9=_t?^9*KZEv7TG zSZZrz3*y$a*0DE6QuJ2u?d)_lK(g158mStjUOiZ0S(i^XRjeVuYO?!ZXul^rvto-h zKW4;@9y0rP{;ng5s{mbtl}LYpfb;t=4QLGZCnp?WXEn(8Y#^{i_%ArJig>WM_n}Uf z(yTHyEO2aWqq+VsEb0 zjw&K{vVdaT212%Uyh@$}USLB;c8>TX;-}3a+_N4UO(P}QMiNpOU3=y&J`McE*(Mu4 znJ_OgK+(__Bac*^c{C0&M-T<)?qDl)CF0fW_>MZ*G3x{j((4@o z|6b6_sbg1e>$yXZ`bxbYeJW`on7t&}~GpAJX_au`wC-!2f9O-Rv8Jee9K@1KDy^rpO- 
z$-2T+rSVY)v)s%M3T∋s7rgK)2f5bM8Q%(vq(L?ZBNk{LP$6ZrQD6^VCyvW3_h} z)kXkT2J?uQlZz699iShUmd~UWr6kbSrl%h0ozOK#OP>-2%7I~g1?#^<8aS9vw=`k= z#{C~btRp`Bk_tP|=(^@LiynsSuav&zj;{8`;At}kDN_>#R^mT<#H2loa$mW72Hc=f%DID z`hEe8DaK#9#}fjpa&3U3eQG)pTRXsSR@qYMP4@2wQT|}!FR4$}PSoSobJtO@^IwS1 zSDRfq@fQjTm0gB_mWTgj-By8!Agq#5f%20YK!MsvmK&V7n$3G>()WAjI{b6urSzFv zs$x3KNCulb@WKwlH>8ixxZWEUF;eTxGvLbFj)F)alpqcCh!$+O20-H6I$)|s!(1RG zCwpsD@+nDHCP;=w2K$aJUwJ^DFm>^=nbry)K8hdF{Vt>1cuMwZ9xV6@FT*LX6SlUMP6-oKVJ}-|@ z;88~~z!R1;y8r2!M%-~)o^XfDN2KiHW^)ssNPlL7ZhH#q(oQt|;i?7@X*1xB(^Ha; zcd#pa?k9NwrP`0_o1eluzn0`>2?iS+rn<%plC&w}CE}k{0C)eU3#Q?`-uOSb>tO*gl1DkA_S`gBl$o>+)#yMTZPrz+s1E}tLRSm!`Vc>R0{I(L*(vI}~HwsHhq`#>+WfgmM!S&0}CA)W_ z;9nb9T8ihNL4(yt!{`-2T06-twmDk|_wp2v-p^dvfI$^u3r<31$_)wSa~*yKz0vaJ ziK;-&d%^lm0i8Bb)#Za7hS|LOL;!wYbKr77SJI>jwnc@32~Rp_cOkGVetKb^9!U01 zxUm|Dvr)twtUxsZCI&kXVJ%eCX7p*;f)1EBSSlCRZ zkqxeE0NG6-yArDtt(7o=n{S52}3Q`T#6a z>JIf;XYF7+sLQi?!fgZ;md!{`RKTkYw$3>M5;U3&3SZc3v5)2Y^ym!j>^%-0?y|I; zN=2Lv5MJ34>A1Z;DuB{>jX6Uw%jsDN*aueo4b4_%e0pJ$yvUwkSyRu0sG2Q0?~R>K z>X-3YN+Kq_nr)d~^Z@GI{Y&>7EXN%g#t>cQr?RB0H! zaQHW5)Qe=|8ziwho^^rBo+U}2?nPh)!GSRFA5<`<%Ie{^av7m&(=BZug|#tW@CyII z@yJ2&fCDaZddp+H*!#)=pO3ZXq`pt7i2n=y(48`ifV z3CLt42WEK3)`W>0jW{|(%UM;c)`|SB;a@uba-CgV#6riN&nD^;`7~HAdbf`O(tc+P zQ(`f%I9W+@H$Dn0@1&Z)TGzpZhre;>i3(}3$v-!Ahw4S@Ag~u7{3Vm&YVnQ+IhB)? z{GAe8&b^0B@IubK_|P}FtmTFqj#{zKVEc$45&w$#1+C)+QgNboDMgLiobpe}jfgHU zO}Jgq98#}ueyJ}0Ur3BSO7;MoiQK!i8&3|uI(5W2tK>OY44c;(bVbVP$>wXsKO??H z)D+kE{K|%kCOD+af<4(N)CRvW@c(WIq4X5NvA<$u6)Kg67QEdWhBfdkBdZm>1M(Gt zTw#s9kzKsPv)?fC);eQaD`{1k5UL?tN8I!6BLiwN z-InJpAq=&KyMp0AQeG>q!PHEMJf5+<&N?qs;r>E zkd9Vv`C^T5hs_#>q|-sTZb()xfUd@}$qoAtb6DyLlHyZBrW?&zEbNkgVVgq_Jn;+~5ulg=!8Hdfr-28c>7GET`!7rpp@Q5l3;m6~$m%B? 
z>dBDGfb{Q{y^vHtA;s{4i%ZNn!6NG1CM=X3QfWz0gBuZ!y;8x_N+;_#raLdTt+&>y zr4`^)CIJiJ?}dXlV*HEV{O=K;ZKdC`8iXjcM|-D1E?mFF?*@EenoCw-2e^$ zNPJeU)4wpct{xlZ|5CnmHsgK6{yOxvkCvlWmOHAUrQEdq*+!6`fg8`9?#dGKehq-c zJp?gm&-$z9Y-coQwGQATX;KT3c_Ic(aCyfROynnbM5(f)WZv)~N26voo?ljQFu_~F zMI@f{7NI-|9qok5+f_D5sN!GB^(>&JaIDn0#8mdi?39!@lEtsJ0X_ar8SagXhK0W~ zKD_fA2v)D>qGx+Y{AdGC)EtmUz$Gp(?Vcl#^vO(Twvk2~71T?_?-BpSmsPD_l9*3c zSST_FQgFn6M*I=+H^|IaOF05$GFnr}#u>U&e%l$TqX@2xZ7cb7!!7%(BO(6gVA|B= z^hjSPC9kq=sT)WYY znDCJdc)6WL;I&7X`4U?nDZd7T;*y=IhEsi1L={$%U}s<^uXcV!grBpJN|Qv6OSu## z>Kk#OiT%&!eZ%DUztQ{{)?zTPC3M=1lKbDYf6AJLA!Dl$3nk~T!JOLl{RbvIuvHVZXN5(USn=z+jlH*`;e~-BN1jNrJxD7y)=y_+`EQTd zd}qoGfv!Zp^4HS0$Y)3L+`yg-KwyBNbND41_nrUz#t_LJslE90Piq!HG}aq2Nb@2p zE6W@{ue1ip4HkZ?|OXtm>Mioq$w`%HF|Qi`a#Ezkc0+wLXQSKz2r;S9Cre z32E2thbro&mC7ONgpLBwRIcrwY;#u9#_{m>|KriWzhac?-6q^}{q#etyyj!djJC;GXQb7WC=`r&coi?z6Kk)d&=Huq!f6M{-v~b zhKZ$Dj-JcVW>}dELaC0KH?xtEJHsJ40$32(ZxMg9*KfQVUYa-z@o!Sl#z5f6)=li{ z34-HgvyQ7ea+nI_1^My;m-fC+{=?$3*Qi_5V3wH9v=$OA>-Qt#zxZz_B2HnifG(j5 zANaXs{d5KvCbF~4pd|Mw+399||Fh{t4%os{&H4-==?&}U9CicPt0^Oi1HbWx#K03g zyP99#bHw%amUWia2y-$hwGjitl7$n##wDrhW7rOW*NERucJ>bAzhlaPj{2}RVO4hv zV_tfjZqq}pynlotoMG8n1$X9j92jeoD>>B!AqBz6{vE6luC)1<$t}$I8fiRE5GJ~{ ziWqiVqbHFRM@!=LpL+I1mi{9HFghVO;!|UJU}0sPgRv`xf4{PS*^&4c78RD2KX1Mk zSP)}?;8;A2cC;b%44W)i%{3n_Tr9evnZR7x$mk+F|DBaC6v=!SudrZZgjPjh1EsPkK#aA|;a)ZXcqCW|oeg*sg&ah{tMH?LSayUORJxr4Q z8w*ida-1tsODn(L7(6n-YgML^`o1~MLMqu7cU!~R%J@`}FE(S?Y^{QW8L4x5lSTXc z&M(^g*N7*}*RLEn*HoO*h~?;MN#B3qNXqo33FYb!m7l*%dS2auUC{%nIe)f)MbPFeozQv=H% zu#>Qc+sVS{TULWjY5-S2a6`P28xWaZtsTHjbiBe|maw$WMF`pbSf@Y*GTS3KMEpT( zx#Sg+Y@-oo#5o3O)6q`vaQ&Or1Cf^WODcYDSRCd?x-fWqEtDblg9wwLe3rU(9h?f~7hABi=Km`v69E4whP z?_X!2*;e~WmG_(7w<29YL-XJ^t`fvt`^y0FF@0eDGm zeuV`Ma{ihT4`+%YIgJ`bmPeaeRj^u(jS-YgD{o|)S+E6f{~7To^9;ix{&O$x+yFo$P+Eqh+$F3>bb|t%Fdmh`Ug5tUxIoJWJ$y8cX0o{1;`BW&>rAMT zte={#hKO%8pB=4*uOOcH>pl(@zK94Yp+kj5`omDyWrx^eN2J`7b8gWRdJ7#zIded7SAWY$OgK@6VkR+pWB 
z$$FUz^Ozp-o?&4M^r$mJo@D=JKaCZe1N+mVTfYCcDPs-7_0a&DG&4*0*fH*)g(cj#BKR0|>U zi1VvS!nMpfg&X!Bq=r9SEbqzq*9`&bUw^l-H4GlQFg5zsCOYR1xa0-~0NxQCsUFsm zh52PY5Ku^Mp}s$}R~iUWlZ3?POW}h{?daR9n5=om821JOPuWBmNo7<76yz$YolFP- z=k@MSNVn8#;~u+0!20rUWMmWkV#7dHK|YwV2?511qUhjcWuuK((FOg$5D#jHs+nCv#L80=(MT zoxo8(^0hPYqRqO1GAr!KPIs(JX^5vn>~zewP%5hLM(3(wPZZ!xFv*L^uEH6t8T`tE z%hha1u)UzL)i80#&&%sYu?w?UX1&5emsiOuq>R$tir z$@)eI?D>Rw+}4dC%78jt^oe?xdfYmr+V&X&cXD|)m^b2&mL+^Z%e3)XAmgu0WWHEi zLTTo)GfbTX(81;fTv@XRQl^Sjph){f|N3;Wa;_x1;e#%8HawV!IsXU*Nx~v8cP1(> zN}Y{X4s{_NBL0Z@9vP)_Vx+oeus1%Q?mae#svJ;obb-SoXCT6 zC9vAPD~l{@THgWVI{Q0!^SC>pnl-2`Z4E2XOA?rKsI$cosn-@Od7_BbTA<21fp-|) z2w-YJJmbJ3eMRcN?rbL*5{68CcYmc-m)T+IPBMm ziQOA<<3D*-;AjGmY14oXWE!tz`;vp)tW9S$Klj)5dl2XaE0>CCHtKoUJiw~%sqWEI zau?mLI(Qf{ne2Z@=$X;G-;7(818O&gL?tzhuOutWdHxCHYk;qV$te=!D=8e_tf60l zIZ|!$wgZ+^XfsLRcPWopo=ss?0l4JKA9iv# zCN^lO#^3N91J{3B4+843JZXUj__G~|MT|_b7ICy_gRa!BJw$e>f|p9vZ4DrTa5ZuA zZ#LRVCF&i;;n+-qJlWFET`U7GHVc?eALa3R= zI5F*bB8;>|)i)Y}W4TL9Vl+PD_E9JEg%?I0EOTzm(M%jHF@^U}WeuP; z9r*9WGxfsk*x+%EK`NVktA6{L&@rJwylgIhVJ7q*@lW>o$Q{i%*1;w<-2>>tZGDUQ z8)0Ndu+kfbkQ>WYOoERS{VGzhFNK+38Db&=f7^H(l7M%sdUk|_8+mY85*%ckxc;3iX7(KkVVpwoflhp?I9s}-0VDY}{dp+BYmSpbxwf78@;HYI_W}cIh z$HCUQT;IJbT^k$OR}pl6LP2d*aAyqtg(O6(tdZk7879q@h%Q`S0(Wa!@Mw^2pX%ZM zy8oJ8>Ubl0gC|>>%6=zDE;d=LDb3=R+k3Sxvf30G#ikD~SeU`ELjkP$r?rXYGSmWJ zHyVMybQ3JB{uJ>`!~u)Qy|rz$Q4Qzjm1o{)BASzb$jd0=MOe*#Jrw5vS ziFhVxbS&78v+$iV#D-E3!Y$yDt)@(9=CXO|cQ3~h;>p!9aJur}>u&UxqCgmS(f^TiL zy{mT-TWoukBC=$osp^SGp!68=$+TFxl1=>Rl^-tb^%uL8 z3EoiA=oSvppq1rYi%WgB!S07_6?6bbyOe}5QY6&|43hM1pzGqnTDY|T5{Nx~G)yq; zpf~pJpB8FO?F)s;>Jt#X!-vkKNeBAB4lG-VF6PxUxCt5c$g%5)yPpx2lu#(oCW=ee zUy92z#mX7pK%_IK(Sb4EiJ znETNts=gBg1Og<(iYx^#Sc3^1DS*-28;xd6t=&Ap0l!(ovnbnr z#%yqJTdG3c#z#V+7MMKZaSFWm5o%+%iGVuD&0kpLNA_a0qtmW?xe6XtMDsfh3CsGu zM(iSfg_n5Xza|vcOx(VjAYd2q6^oHh@KVDuXkEApqP@G{Gne}PN78Tr^)pKIbkuT03fP|O-(Ya8n( zscoXuE(?H?r)a504n;{qd9+ZI*T7t!wEU%|FRZeeoR1KkNfIGmVFyGwnxNxs8`SU< 
zc?LLGQrf{eANfy{#j|nKenbMaHu`b0(=vnJTH;Z%1o~~2D+`QgPi!lxjY*z-zqnhG zf>{d%Cl>k9(&fKC_gtU^puE~JEpGp!2%GtmAU&A1ChdE$MJBYW9K{1WlU=Hgf6z6DPa#+Kg<$$mnc zxxu2B2DfxBYb?sOWJw0B8p>$7U&(IujB#kJk@K(TzZZd95)L;>QO~8l7IR!HK?`ui z6&JmD-wQpc4|Z4cx*1jmR9pCy?rws;oeijO$Ke9P;fXu!co50EsJfC^yBBe!?9a(+P!ZLATi{OINd4V3l`)=g2^N+q9xNmT!c_?2Va^6+~U z9i5FyN;pK-SeMA!*fh+A0{N2CSsOZf!H*v(O7>XnVATuRV{!l&1h0m-0drRwVL;R{ zY?gsaaOoo_{s`V(q0$v}Y7+%=!cVB|BSX;g867NCVNwK<5IK3|##<4h4#xE>+1Me& zY*?=i7Ouo#dlsp&cyKY?oY#O#UZ%1$-#VB-uR&}NR)E?l!Y@(sD=c!!>thD$`!Crv z(t7}?jhiTW+#{&#geNIrEAKZ@SlU0?Kg&XQG<7`Kq~B|X;70U64F*u+`V|)>yH`g%Mf^v^KTJb9@`V}?yRdTa>uD5)l`k^_e#Q-q(EA$* zvM>(0GT=}XvIED|eI$c>iTGi?f_kna%Q0f-uj?dgwyyCTkXam}Y;dGa>wO)AJTgr{ zjxbmvd7vE7{Ui9(q)J5bckg7>14^q64q9MoG}pCk%FoTqZdyTMLOT@(#07N*qLQOu za|9ZeYJ^nT;fW6n6U;NfiXxtZo+KA7p|e{v2kTQXW9rg9%8e}2z9VnE$yIP-K?1g^ z$080q&dfud*Eb^3gY|~HqU+N#(ns;@Ctmo$aj-1xbX|_=*2)?bVmQBiEJe zXl9;clRf_};#TjH(We=XM6Sj@`g7$r> zfh>&}KQkf5jIiK^%{-chQaGx`A|E&d`}}E=sUZB*?E3zgu(pUtcb1b(4Z$ z(zS|z5pb;yP2Nw>nYm-5VN%6#IlwaopxpKJl>DA>CKnu7Z<9$c&?HF!sw18u8H4Aa zFlLoi14YkmGXTlvl#L$n=if-`AB|c|KGIO`P4=}j!mdL}D{5q;4krwx>Pt^ZdVm+L zGWZt8Ar}Of0yL-!cyB86Z3KNK{R*6(vdKQ;Tg1P3?{l6+RD{kqIuZ+Ta7|7%(hK@M zD`E$<(PwHM1MWa~avkdlq)cr#E#0Gu{&{ylwD&^x`wM}W<2;{1&~q%_*Hq4atnlHH7g%PIJgNu^Jy zMmi)rlcj8x(aC8BFq?d}2SFRb*& z{j2r%-R9vf(v%+&Kh3@5KBzQDt0I1+LkMci(UVTpmMJm^vi0u~KLERD#DF#F>o$hy z2|8l$MaGWA@ZP2sZdg&r-iX4H{e?C79M?(U#IOEA7gWK9s}3x2!bS|}VY@yN17{l| z+4CBXM|(|QviOZL>%A2|d#hk7{mLu&1=;08!%K)^>4TC0|@w+Urc%PXk*;o zO$b=8UC_WB&=%3>a*AIgzQbRNV9m>jvY(n7LSwb2Jqs~eZ8ZOg#Q2V#J_6Tjli=)4 zmY)}aJL_@j`Inm&GtMTmAKZ~hf^?)0*+9kay=1Rb^GB$N!~qk2riEsYKbQ#ONd{0uAxuUc_XdR~ zyR{7+a!qyjXcl|nCuTComW*^`o|o8$gLSS2d;Ug4zW2!8m#s5_`wWq{x0-s`t0qn+ z3j6C*5$kt6yc0f4ar`q!YP$yH&6E?5qoZE}rLQJyxUZkIhRfzD@d_NU$jcc#mDtaO z5a$~4J>n}C)~7F!kg$3~2cRPC$#RUHZReih9`R8T^O(L-{6GO=#v`b#f3Rrr*J9{+ z^^CC(maW7Ye@n$K`Dbxpl5tn`p8SaTFN8$u4$Tjb9^pWUE^!KqczgaZG6$@piL*U> zV$H>7gq=Q;p$D1fk~RFNA^qB_F9jYo*}~aWa0gRSDT(jwO@sCN;=RTCz!4~w5H+`4 
zGnvD4uK_G)a>2o0O$5FtJL&Kae1m)1dLYOob?~Gm>_or$^2BcuO~iljcIAt)013F311<2IEiZU) zjW-h}-bRouOS}Y!MGgJm5&vNWs&~xW*-qgDqgShS zfm4xlcr{?LrS>WNSdiTWxUMn{X10nw@loAs4Gev0q)eWEb);{fBEnjAN~`yMCr6I; zFun<=p%yw-b0;Fk(ZIk4|0573!bAtXVo_!rk=SM5FZfz*qFHH!kHo{v<~Z^hNA(`@ zN5o7qaJNNxE3?0YG?j zuvu*b$?JgW9QffJfem!gjFcQ}9&A8Tc|^y1+zi6AX-1ZZQ?OQc{b2gUkY0U;LUmk#{#pWy$dF+wMq#R3VO4w`KZ01xEdtu>TS-%mV%2Ya=teGTXBbBKN7keOpUKnxFaYwlZTn6_R@y#Y4zS>r)wzuD;z(imC zNM{Jt{Lov(XC_To>m;_P%2nG0m&q(bjO6fd>hSWo1v8r@3LMOeuGveuB2~f7b_!*e zJ>Km@711+5y+Y|wT0++0RNk3qmllDQ0`KSoQb$lj;j6Vyt9$_}sDc;5d}03#3?r+! z+e20dR+Jg&q&wLpy56vJwb=+Zb25GawrAp{RB2Ce4DypSX)e$R+-_H|FeWyQPSACB;Zsgc-I>{cBI= z{m;Pv-eP03UjHQ~Jy`4P4D{J?sGsmI2>=u**LWYb6D~$_fm(UO%&u80N&~o>Sc_%N zmoB!jRH*ytjd&9ayE6<};2M>_seqAVP`SFBRlihtzCnamobjXvwL?Brrrn0prUw4w znhoMgC?L)G7@6Gj2z4}ICpX4~BzIr&Z18xW`RU3C(;_>17q)S?d5jG1>Aw+DP{y;?Iv>NUK#;>*38_^nLA4*mtxTOEQ?*X^v{S(#IN>RB9L!-wn(B*@a1e1RUBpx#Hqzyx*t@^27m&5loysXG((xI}&f0cQM%v(*b zyTJ!aiStKAaKY!nyk9}use^U3snhrM6H%=i+e8*CHF9OqcU}j8D=#cN801s{nLuX0 z^sh|EU3v7*9znLYH!IL`biC5>cczBlzuDM8)!XV)Dj*e`jX0A5T-7ZG>t^|iAzjw+ zFW9r29m9gH5*Ju-Jd4feSM7Sn~H|6;U;9`UaKvwTv;i^9%R?$a@ zlSM*#t6~#FwKP%|z`Y}$Ba9cSZ1(wf1}Z^En08;%(T>H-H-E7H!`hlH8e7rP3c_`E zec5>r1r}sVEzot~PsZ%|l~K-CC@5h~I;hpwN?oZ6Xhy^I2Z>l#&>QU5PuWQ6iPG4s zy)3q8C8gtD0O2L#zaoBjA6{6kps-%D7ict5UrW?{+T_q@4#!duJrL6uyL-Z=ED`w8 zK}Z2ccs9CNCRW)@z5ytkr@+c!j-+GKi;KKcUbtVUz%vB4Xe(6^XRDoF$-mq6F)9tR z7Oa0~?q=jk$Ia+9;;SnF(F2}jAZQ%z<+>XIIIMq4g_^yI!p;PPaeecdx_gOa(lsof zk$%5hc6yc?*-|n236A?2@k&AVGl1&6DZ^FJ%;MWf`lqoLXA^D5`X^*_J&zzF<%+_u zT2rVKv7%&D@h6i+&xrXt?q$1^{0V?Pv!XX!{0_4b$h+UUL>#64iGWgV^NW<$B~RXQ z+dHh_UlC2j7X#j(txtF3BUjTi1hfIrKOhh)4QeIpH;{EmdMqv{%(e7u{h7Wm*;C8T zm-xH}zaU7vA%vS!wxv%0)0L@g2h+?Z_TRwM1_dN(B4!AIo5=<8^?x&-UV_a%jB*f( z@;-)p02tuPoX!|n9<>i2>6mY_8+c_9(8SfHSgbLomiEY>nK6{4T_a-K)?B}&2@SQz zypg1!Om<+3Y)|rbwfe}-`fYL>P_NzufmzzKG!ST&sR-_S*c(g{#Vk#D%`tdmAVeO( zCSDrw({x`2RL21XKN5idMmqf0!xtu_4FJW2Gd@`>fV4zaw)iI_{NBi@YgXZkz~GBt zIs1qRD9y|5DWeoPxq>*EBd`e<)4{d6@@&t)v{?ot@285@#Lq2I@Q|MB-5SA##ee}E 
zt4Ju^@Xi+I8N>(y+(BWT(t#}%%-fuH9NhO`=tn}&4*QWx;j+m-`$A|(MZOg_pHnD3 z8O2f_Y6j z7=QW6&z33w^=Y{B(v2USz7|+ApyS@wFL@E#8RLVkc4<8MK0|NnZu9;v$|0V!U~v@))3 z+oqvaSsBcw>@AGxKRsp5z=cUG2C{|=oYeu&sbgHo`??Gz(W>&GrSA2U+`q8p7}t*= zuDnCAcd){L2?aJoXZ2YCtH1}ZR4)r~-;PMEB;ZUbJCETVq4P?);(=1IcNTo8f;(D> z2$oa&!(UmZ7yM7M|CMnUdvtC)%&tr+f=4%DFCbjO=bEW9vYZFD`QQM$_BA}vc2Z$a z1OB8nee`9mBG^N~HJ0A*0gHnv1qPy2LSqVVC_|pux_7Ic1WfD(f)_4pcrf=N%cj2q zbRRqmE7C5Y*`il#?b8w{71lU*-~Hvz-y{BI!{DAd=$D8e5hH2Ih5;?O{wsL-6+qtc z65|_CU|VLY z-bSmuX7{f?go@@X(OG+VkqO`OWae;xeu>^>a66p&D^GsIBJ%)N*s6$+Z4G{Vc_z=_ z1OKqc_`U5=eT@Mp_yIkm{HFHAU+ zY>MhFNy1wrkMEy)y@oY@U+eBTV6+gIFF;ALa*sAJZ{d@6SI1SD<8lEd*ac+OOG}X=n-v( zg837vl*H|ytS_j_?!SZHlX&HU#hQ_^ zGW#R<<*MS1^_v-b_F(dZ(oRJ2(Unyit>ID+M&z~F*kExw`A9;x^bEdYssjgEx|r|0 z7W{sMJ6?VHDdL$U;7GSb1vayn35zW6J%Dd#j@_Bcu+BxbdH)_(dcJw3Znm6)5oukL zgw94G@7VdDM6Vq_ciHocbw?%?(AB0d)^=0s}M5KDt z+1|xmV;k?**f(%*_q(iFs9VIJ+~NWs9+->MTcg5;jI6{4TLZg0Kv0-7*w6uwu-q3i zAoWk&j3I0cUnlwhTg3l2;&a4*+S&~4SP%9%5=l2W=!xpZBji_wCU{Ti5lxhz-ob{{ zOhz8nW`L8uFZ^{)CQsbttCd2H>!x; zu+Hj&3M?wGL(cXT0oC`$`v@mAr0*ZOF_~mZ5l$|#!bWTAVad(Uz=Bt+11onq@#S%I z1l01mK}>$K`Ogyxbw}cK#!nRMN01Ojf|n6bb|Ad>Ru;~`JepAO3$bzqRy+q!}u zaHA^2%$9+@z^9l0MuQth)hfy@3;7Ze7?nt|$ku?5iZBOpMBYJGN^2lJ6$1S}7Cw}olX7QMr&;V``U+^ArjQABv`^b~~D3LpbJ(ze( zU62y(O5}Z7>!-1h+GLAdMKLhAV#ga8)QqRz&?m;>)SCym z0AuYVo^aGZBd$mX-)$J(Q>MibE49a67xba;EN!mX-9*JQgI5*@_9NmGvt(-=^Tk>$ zn#~NRL4WXSvyfD^>M5l!$MZ`@!^?Y>U=;P8OT)Oshm4z@q{2lqxUe$F4T4ksR{2Mr zw1d5va57#b9 zs|#LitnU5%UFq&=Nb`byUe@o8Zi?9m`{dYklvu*1v5f4Q^dbEwg;E%5Kd^JNr6NVd zYeaCsej_ta>`&d>lQCx{iy|Gzuzo50OmIBf8+=EOqj;eKd|hq^;2Bf0*(&>j$IOf2}pYX2ATSbp308o=JPk}aku1@B~eX<`4| z4M4Ti$^-{zmTSk|ABd*O8eADwbFwoh`;0U1-X`G`%L1pBQkFjxM6~anVjGU!LT=hOr|rt zOAsX4o-m^V-*BOTvR}WHeV{-3fu|Lv9Ps|Tk50dqa7OA>L#YbBs>|a@PO@Ewkc@J+ zZiL3d^iJG2sRPj(h^ z70Axy`y+dH0Iu}>)@M@}<)48JQaDzHV@0o-XG~O7VMX8|!Il-4%=bOBuGcjPm(Z~y z!^9z9a7V#&|Nx3Im`0-~7$2%&mXF~GP1`9}Jc@N-) zO`P0;WIBFVDk6uf+n?FUiUkt%<}GI98j={O^H&-9{LWHI^HX8`Nh4%wf%1jzSgf_j 
zk?mNQYqt3J0HC&?K2<6=sY?8U4dp3_#*=i!xc7)ZBHD;A5r>GsakoFS4V>**EtUI$ z<1!d<-;pU@K0<&U8P<_Q*TCDhq;|SevW-11R#8rFWX5SfK*~(n^GC!!;%^cE1J6*v zg1p(Q-b$0^Y!p#@(8+~6FR_h^)cOb)lQ-ke1Xp=EuN~b8qa9(O$<|-})k}^1hBH|X zpfdVHnuX^MXf%>)2{ErM!NJ`4HW4tT?zTv)RulGjB|_x^e6;%1h&c+)N$E6g!hh)i zS{tKRSQCU8?h<>?*I$lWTEEVEwkH1ff;r}mSZC%H?6^)#=&E_w?gP&_Z`#k9l(_-{ z7glj7cXxh1C@F-v+jQW)QS<^DH*;peA5QCG zXxe=JrJ8ju_;vX*0gyWfFaKBIA(oVT50`quz_0dFt>hySN{b@U`#9uwf}@rEL}_nH zD8X9q*8F!gRJx}I7JmOTO|pxr{$k3Zoq31c^W1-?Fb!mf$9S-S)L`BciRCi9I<3(z zuzuluzETCht#8Bm)zLV}ni;EKfd5&)N^Zj=-#h|87Bq9ly&oyWzprJWH&YjnSi%j) zu;g|+UcT5lf(ZH$p0PDa2*kb);IXdXLAQKrKnXrlL#M& z(z*3E>41o(WG;W!@b8~ZVY;D$P|Jgqtg3A9E$-wh2H~Pt4xNN(H*Q!=^bA~X`~WYF zh-b!qIpZ6oCGA?jl5N3c^{FN9oDC0cJhMIvtMEHacxev2V@E2(8F>Jht$SLl*kY6g ziBo|^4(pqjR8MV7VS=q0c%v9R`o&d#mzY?Bkiv>d(v{`>jRL5%ShdxoTE>vYM@;nf%b zl5$59m%{#2vUv}j>fqji*}x@O)P(YWnjuY8zTKo9x$U$$$Q~u^L4`E zLAa?bdJ1SK1Zs;-EmgzsWO|)RCr>=iVY3spN#9x{yLd1ynU~e3EC%%C3xW|k%%Xz? z-;+dqig=0muZZt9&ZVqjwtPq@q{Hj1hlE{GZMDNl3i`8E<#v} z&Ho1uy@YUWtu@QZlX(*iOnAkUEj6;Jjn%R6G28`rPXal*o6y$L)TnCvQh{+-Et3j9ey`RFH#knJuu zNm!P`@2|v)$+Vd_g8Rl2h_D(>OVJan3+n^#EYrR}e_@)5n#X>Cy?-@U`Ibe`^&<>8 z5Fwz!BnHKr|0%!<59AIC_i`^%7}myDB}G`@rxRDMsNtO8S>HZFaGl9RYEFT?R}cBI z$M_Q;R8q(Ngd4k%>|O{gi}LK*NY(rOS0uZ;v~zK^O2ChZH+n6`&HLBjkpjqGLq`=> z)e)n3U;iDmXJ>HuqbVIaBFXa1s%a79wF<|*8N8%v@(We=eQ}F7 z@X^HPDUa#d>^VbEH>jQmd`7`vQdH`JVTtofN{OY_to3}8LFR5#L!~p=8lN_EdPa^; zi%Tg|rs%KRIO1X^PRxZwERY4CcwO#!GXhtbwT3;sp}#reUf(xs2;vPJ1P47wLfh*) z;DpB1o%BrV6S}A|I-Tw6Bls{{=hYs_A+M~&X_t9#H-iEFR?RMoLMYIID?v*b4`_=29v{6%k)- z8c6;T3=7+W!pQUcv_^%Y2oKb{7)V6g{A5&Wp-NQA$u9iy$?VH=pI||DMZ)S zmK}xwlLt_+4s2B&yB8Lqu*zD#fCYZ*S90mm9&{w}$l;&gg35nOq-=RsxwHUQ#JY%m zX6Z|#jl?jfaHBe zq>ZfTi6y)t-+W*pj}~6X>;SfSzZu?Bi%l8gkr%hTTcx?Q`2cO3pIkG2(v! 
z6wkIos{TxyQ;_^8lmuuMP8@B0ZD-Y*{P~rEAE2F8+@7uQuEUcAz{X1U5}yU0vo}d- z0zmH>KyU3)1-rL6nkN&mEl-mBuo54#X8|i~$z2@C((3rYIZZ&g7}tW|F7bP+UCt!c zA1dK=5-Vr}J@n09s$6C%c;LN%FjMkq);}AFZ}Ag(L&9W^dN#>(0edNMC<1;P!H11B zA8Dbftv=L~$0SAUp6b+@&~e7;96)9l+{k10LKDHCAlp39`0&O!k~d_u^QHrtZJhFn zh%aaOi3I;>1HY2*kKmn0?5rSYGj4CgY4QWDO^}2o*`GQ~e&WnG)anGZPUg_ZCBWqj#a>4W*wk`winSn1{Es zl(8l2Qw6%`bbj|N6Vda z|2bNdUf>9Hq88|PrA>9p{HHd5kB1o23b;zaK|S&w$?h3Ij|!uri$IQ{#%UIu%?%xS zM`)TzNA_&V@>hSR(t`Y>(#~k1P))x}|Hp(6+c*OgCf-8S-f+eRAa26FZXf{34V7Vu zYMIhkZVhV{)Cj%tkj7Ze@0qNNq}V(FI)?8imcF57nsKskPXKZB~+*ZItnP zrVkXBol6j-CJay1lf@Tt5g1}m5k9P~A~yHGwoa9TlRE%+D_m%4-KDd>dB4|+1$cuu zlpbv&qugPh2?yRs{D}CIlGVwEXUG^2(4ZR#c(gkfWYwY%ki-K^Qm@xQg|8DJ?#;A` z1!$(BL8@M_;SL%OhP$nCEI9%^Ba&&2cUIhcvhu(-3!#`)7|?j4q#}={Ggf5|RDWY8n9L@K=rMt6 zb0%rY`>un!n=0GOV_*1dAI?#jc-KR1Lrf1C#h&wX08b?Arvk#0Y5%4B?EwBe=&W21 zEdSGyY#xm)kPkhh#oDHD-*8VI>D$bSD7b&|g45xQFVyWY z@?44=7C_V~u!lL=S#imbm$9a;ym}Qu^Jm zS*VH({tSQ?M6AG#J1cTd+5WlzEY0ID+opo{V$@IWylVfM#j;xd_r{+ri?+Ad6y36C zC@s$&adFDx7rR}I=Caw#rRqcnnt0^sT?yGe6{PFt0lr53AkS0(No~Zes)3R1o+{wp zOA$tzj+_luFLD)2iN`)+Oc#3v_zoys8`j8+kP<@ahINc;fQ!bYlo@;7R0PF_$VK8H7-V3TVoVbZCE|2VMv8kAe(tGq;+8l&h~1n z9D%C=voN9IPoopXvGkh`=t8&HBd(+7CL0@0+8TmSYA7ynAM71`I)-}2jV6SO&^@vX zFND$MbLAlDev^;zwvOkOprvk}64&{CEk>S>aWr%XXyn*x8{@7O%BdQ#Z;WHaD1azfRx6 zGT_2mPvlgF$$PO0fgjZ-ut%Lu~ zD|n^-&}ezu4qtP@PXC*7*PLB{Z=<(c>)YM4AF52}-fIb0pU}-7!Alpy@BvBc9C3>H zid7{JLG3&`x;;XW;C6cc*=DgioA)S8C1K(CZ-fxD0koMOMxo)!l8~Qk;r;5abQSBQlI0=Z)*8P2eg}QOTy9|Ii`cP!`hL$j8`HjGw1Go`) z_Y(0ABU4&hKH^C?`dA8UARkDDj>aNM&zujTkmX+JlxRt%OC!xI6K4$T40s?lDzLeh zuMIYwrKUD9t^u6vL~1R`*jvuLa;47bmN&f|0Z}RdhK*fNh(tM@Pz%#kVgWy8j43i0XKa_!jXH z@tgUg*%IBt^ci={D>2d^)blEfm{-7nD%LhWC(A%zP!x_C{qUC?DPedXtm-pZ z+9g@?wAqHIDzwBqLb?l}e6(8llto~Pd}=7ReF|k&{J#Rs) zeMKWHmImra07+2j&N8CyZ2VlAAyu@f{QJ0#2Wb^RHG`%MYL`L-Hpo8VTRm*5Z*z!oq0BaV3>fLHV){ zqf2qC@_dUrq~ zM)$-`sgF1`3;p+J5JPY?K39N91jB8eLKWbRFWlnN*f|)KSI8c+8iGprH8s{9VE1aT zLcP+STCcOf!eZozC%q9YC4loy-jEUYrj=|=cGIFzyCKMLvSrlb32*F}2vXWfXxYz= 
zW?C`HYGQaW=17GJ(TE%0KVYIq_WH=+0FhsV1-8Ti$-hZO(^|GGM$n5S$)~dZ*aLRo zTQFNP+9BfWCWqQmT2()^NU0mx?J44K3=B((coTa2DqWii*MFcNdA}Z{q?2$0qc2PK z1H!Gat#@7VP_p`C6&F{T!06> zOiMzbvQTfqDQ`s&yv!3W{*5rQvn;rQ5vbQ+4#>SS)r=HOG_H7}I$J=Qfmj^zG&g$( zTLUJPRpfQp4vc&Qdz=UtKRLSX`pJVI6O%2-GTVFwMbGzj^le$W#!4)UOT9$AM?}OE zk?d{#Mr^sb(d=oLcFri0ZDjHC)$b%;v$Yb0HTAUX=&9nu8G1%YwM5t_a5Hi?^ZnaJ ze2#eJYO^tN9z7fEeZzPc0P>A;fe^=@J+CYeO;kt~;!3g9l4Dd@tFjLHkOV;N5O4Js zu@@!lleLD8n-%cj7P5mW-L!C139{U|13X%p(mj9WYkJc8WZ{#**$bt0v`w(Em8kq&04t%Xo8~qYNE*_=eb}3 z8pd>Nun;rrR;NT?74aqF5qs>cV>3oHTX=`e!(CEO!__@k*Y?RDV2);^-pqpZ~=R(XL-M)x3zk5d?67A+0Cai}j}F2!1lLbwwfk%6>X(|1bd5tde@*)F7cmYPQOB z{%LIT>j0w=rb5{>+GgX+WlAp2RB8r->ugnlJcXPw*NVdLWCfH8WYwWIv)GSuYDQu%3}OOuuN?D0aQPU`?#ASC(4?zZKg z#U3v|bX~uguA+_88wW0gVHrXV4oy@vvo!}7Y`)AcU znddbH@yf|;Z8_!4)Dpdsh|gB0-ACLazDHbPgyaclqY(3tDBPdynR0TWx849@LSfEU z*PTporzG&eUM?-wuWJ^k;8p{7U70hSIg2&#o7*U2_q=%NGk1PA$;IFJxu3Y|+=Lfm z>nmsD1xD*;jbv|l@@l<`LVpE;_8pIO+N|GiO>%}jC z4(~Q9x}y^H8PnPUqU-EeX2WnNL!-*Z25W~BwUm5XD2y5|XM+Ek3^(mJy{BmG3mbVR z37Rpj!A7|z8KlaUYKFbBq7{45;Ys%&b@EQuI}wEL3M3}#zWYdoF|pv29T;36o4{=;h0QMEoA{4>%j$ zVS!gUL(p8-*Dr~F>eCrH!)oCP{?qp67p(l5!Jh@!S#Cto;g(S-1^uPsyfkkah@xNh0 zIuc#PA7+J<^%vD2-MKy^BjCf`*k_>E-|Nk>0BEI6GT0Fu7{B>!-Cb7SQ z1h={Q=vsN#oaa`~MB z+L5Cp2}{o5pUE=|guAYP3NU4C*764;W|wNRTa2bOfzd`0_8Rf8i2siGJNec-5xYjB z{yE|aVPvxz1GJVmgPaE=B?1t3TvG@VdI~QVHV%9ycT^i^e)?V_{(q#$zcNl`+2@B8 zwQdK1(Ne?Gq^m7)chAn9vu5CetT6z~5B3Tw-@Fnxm~_&M0(v88e&OXfgN@kj*w=nD zgKQ%{M|`CPuqOkNy!q9nf|s=e{UhSPm^(Mw-vHZ42Skr2%IJ@M)N z`u=wmwh>Rf52kyIVnFg{}Gl! 
zjp$t*A~cxL)%<003u?O?Fup7IEav!ZGj;R(@2$bW6L&HGJFM^_6VW6pAe9CO`+oxg$7e;8s-?!TsSpkO<0M3py~5ARD# zVqg_#(SL`;Az>Wg<#iI925AJ|6{k$%sk07J(;K<7A zs8YV%i0%6<1C=zQMg(zS*=Km`@}pTinIWDzLT-G^ayFI<kFa*~tOFqavd2ekRAx9jHokfddX8%fcH8!Ai({MlX@rimJRl9Q32sMZeb25^`N zJ%x=tm@P-%wtm-1)wr>N%%C(Je9h`mqEOVH(pt8@V}VPK*SMxJ7uWxl+;8TriF~T9 z_S|n~@lpxbJpbO}(ar>DZ`nxl8TtMHig=Fr67fW1?@#vm%Ei5iJ!kj-iM_d*RX{Re=5g_k%pAbGSIQRC() z2zige2FvRRv?N3)Gq9wQsH6{JXK9MCd&6>`=_hZq4Dtp&y2o0Mmal1P$5auxaRfwr zdKu-B-KxoBAJ;E5st8>|DJ>7btX5@VT=3>E%e^wf?5mCKSr+^suo)`+f1x`tw;vTy zcNT*jfP__G3%~!C%7A=}yifka{{qseIE?qe#RdZ)?&S1htJ>^-i)NG35%W4^G7=* zcL%y7&@2w{3?KJkC8i*x9kcf;&*m%DKJg>sHR2yeYP7s2>3_TRx&!S2Sdw4AQTXy+ zp8c9dDy?Vbgq-MR)xeS4REKijNjrnO2IAqEwlSAB*CdZ&Gyc6J58P8U@~(27#5@BT zm}a`5TP3NFCIXNYzsNY;<#UMm8Znq);}G#%nhHuDx8Z;$8>P7X_5uKYMEn8n2z>j8 z2vDV$4kO#yOoQ2KP|4y=9FH3NIrD;hapTY}SrwQ${R4BbBU>KhcID0->4f90HYtWLM?q+LD(JXe#>>nsUl zQmI_mvzUYuM~bzL$?j(RuMG{y8yT=~=KyXy;{7{IYu~awqDFe$8VsDs13Mn7AjqB8 z6l$HeFGw({?!bZ_4d1`oe`JXEKBRI`i_a7pUH& zocR>-HR7wKV*w!9pdfZ6&tY%;s6%$&yBd( z)4?F>xiOTLTh9o$B%*gFvX9)FvX-1fQ~%>j2-t|_=p z8O6pbN^5vxHOYES!Y>WqE2pBdTs&X@ z0eY;oa1*?ku|z07u2cBY_@4?Su(R74KzWk|u-U(VwGlQdKUer>x1!_v{tHWuRBIVv ze+4WiHoIIe;RJjX!`hewQYUN0{pRQYfkUQ-uZeY?VSbJub>WGyP`)u*0q;T=*rO=` z5&$uqVe|%o)B-1uR<1X%;9wpRn(i5C&_M@zNo>3Nr8K7{iF{)hPlh22Yevq?AY=L- ztj2PO*}6=swej`x=-%))3xG1r(NK$v+ju4z+U}`1i=*=Z+d-ZGG&^6~91C zl?JX711#*I$Z0gj!h|&sm7CtbFaleh3c?$@sm=~Y{{70(!HaQD54gtzx#(>@i=!Ze z>G0d1xX<4B%$X^VmrWhNN@kOtxR%UnfXldM@S&hg_rx8(^S>oclB&bKQ;IILi=VK; zg5!FzcAh1?-^`8Y4k*C>N6U~4g4qvCTk8zL%obT1B%u-UXT+cU$pwA>WvwFeRXrk} z(OFmScjI7gRBsAP)hlaB?ZGb9qH*CXtkm_g-EZED2#oOCC{pKkj*DsuTxlVP5MjF5p``3T#K@#tH@)ns+ zV|`t@_p6LpDh)CVNy*b}6!8b+pTmIai%lm|4A@x0#tLv9wdb9clGQlTnnmSXFmrwe z{LhWrNMn*<`Lzup(0eKqkt1D}^T9MN$r4UBwyCkkfr*r_w54tQ;q!Ki5z$bK8MNu! zNZXA6&;g#Ub|b53kYEgWh`~C`@1GJG+sOtJlooDO#F#c$-_58tj5iZa2b<2)twGd? 
z(H=N4rMdJTOnNk3&V~e`wQeglflH1vBL@4velNDF#yO^jWsulhTA5=%Ykdx-m-`JZl#2>zpL0o91xFG~fD8v$caKHx+ zHu9vls>~Q$AJwXE7CL6mi7qvKG>geS!LqRLMk-a1GH;l`zXOrM%Bd>YAnn6_C4>oUDQ&^Q*Zeas0)QzBo zOs;KJ1E|=jMT-8b@n#341s&g+vrf#8+U>|iGquglx|8}Wg3a&`r4^opH*OXR$Be!A zPY>Z!*fMrDkYG<8uUX3=N07>cMc@%%(OXoj0oS)R2;boxI&{PpAyL5Tx_SSe#nvWz ztodnKd})T6I6wu*hV|d!_Zl*z<(aSR0N$raWnld(QHZmw z=Q@B_o3x~Hj`+#zQS(m)Nh9G$3zsrjOKpWk=bsL$?!oBbmbI){&e?=O$wzT}GNsB9 z!Cn~X=+@)ZN2J$171%cItK5T1hL0-P9dFgVLB6z1AuDXcpJH3?JX$cIxAB+wm9Tci zVzZGgz|G>4q&8Ys`-URy>dv>$zJ1{81E;z)XVl`G>ue7vLp?`hA;kGCuxw|0m^h#- zVCz86(!=43Ky2&)ZFrw0$^6M;O6_qY?Hz%YV>)%`uPfZ3Q3q>EQAt35XEtP(pPqq= z$+w6m;%me&Hc5Sl81p~SYDabpqrsn})f;b2U^p3#9NG1y+a1X~pNIr4hoOZSO@urJ z5xg>Eq8F_bEw{j?;4CG;tv?oud1?#|+xdT>vQe!nT zEs7MLB`#^B6IcH-SweR-b;XQ<%F+`6^Tv#X0|z~@MJkcX$k3s?o*xo8yU28_;Rjw+6_{hN@e5bC*EB567d;RQ3f`E z+DmE^8!=ozJf_xh-bz#B`Hi{b5hSF+$FE}8i8yrAZEJ)DGshA2FN8Xo}}cjsjuRG z1}4*(_tpnqTYYihctsWf#Q}@!^@CoUC#q0e9@Yjd70pj0sZ`7ufb!+qfR%wQV!Zc= zFpEE!$0#5I-rR>5cQ-~WwiLZab2QoE3nOnIU?JQQmL?9fBR7lNo5rkJyix+w(}`%g31dx%TFwh?K3&C#N;>n zYIEm#H^T`jzj|qA0}EmW>rD9~pWnXJ1i=|uD*}{reYZh(Z*a{c$EanKi?#J>Ri3b$lO6QhfcJI{qa;|Lt%A3+ zHr^5^Rc78ZY-57I@nD_A7oKH=0*lN5CD(O#yEiUO3}+5jxByyPJ= z7Dmx~+5D4@qUZoL+T(KVzzamDCk$?67Sk8_%b8W|vjT)Z0`YpBnJ%233YWvOdj0$d z5Nv>WzJX)}+`S{{{+R(f%ajs6T(woTfy^Rx}n2*tLvymNvil8H&PWPel0(sIW1Pcboq#(2yTYjWJ@y!r>P)r$j^|4W~5vlX;{^R!#sW7oqHE z!PG#J{4ZH@mK>g125@6m{N5%5O4qWtjB0l5tQ&Y{09UYdDz#Vi7*)pSZ4_Y&psd8k z-qAIhd2PcVDd5G(90&$s#G>T>i2*NVE{ESU^Bo&pzZNCdF{H#})&%RoSHCa{vDwVu zv_I-0obgh7Os9tAo{d7dy!P)u004jhNkl!I~jRP3*Q?>rd$Iz<9BZCO>a|I8^VLSWAf;?qYG{+LC|U{On0wVeO3t5Asjgl3T=o69x{}Y?(M9ZDH6{{oa!8 z7K>;4h8co_fd5)Z`KIiZeZ!#^sp7rHWMf!{3(`^LtU~8i& zGhocZk(aEQPiNpvpq1aa5*r_^mu*^0sl&Ei_4*)m6$Y+ai-D1SfBEz_nNVqSLm$Gc zr9$^_ju2A{LlZH4Pb&9@<;~gs>)~%0%xq{Z7}l&rZMT^O%DK%(1oi;W30|wB7}L7_ zOD)rxWc*iF^apboc0}?g|Ne~qN(n9d=~$hCX)Ff|HOc+gCev>zxlY`(+FG_dp=*9} zFtuAvJW%NDEU+fmuOKW3S}k>}IxX)I3L6e|wod*VweJoIsD%0LYft&-_6{z%$lT6C1W?85IZ_m3~~!7mfC!;T}hj_>Z3v-y$NNh`#3) 
zSy$7z-7*GXHmp9TIbLs#Qvtb=bTiylK#i#!ORGSRAm}mWpf?b(^zN#Lz9DPwaiAlO zZ5z#i71q&oO9q9mL30XRo|kN5)c*jpR(f{!IrnAa@CY?V@V z&;W@*cE8VTL1m(jYCVkI9gtL}YUb?WhIb@^w}@Y}nW8g1QDX|~7xUzcD)So$uvp*x zl@=fYKgrQ1(-Suz_3|h_NcN-&5MioTmH(6LOCHs6dtZ3?D@2Ie5~fUuArpGDh8-17 zU3PWGmdnk{=O7gmk$NZFJG*4g4$1`a2}-N6VqLE_nJ$`Kv_Y7>Fa z{9Z@Ak7$+gCVTP@30-gf@JhUOE*0muV`v!L zRGPsR>}yZ0yj%lF8J2dm01PZup;y#(`wRol#`%t{LTg&lf%HQNJW=sYG!ia6t>i$J zLC?XuNDD`Q{wqq8BxsPVW`JOqVj~Fv1y(VcA&*F27GO3qVx`&?Q7BvnIqq_k(!WVj zcG>O(RuS?w&D;i-PBP5bh@URTlvL=QM7+eG>;ao%7zg^j?pb(Y|Ach~%W0l%N}m6D zz5$|V&ID0Ob9O97Z4U5${f5A+0^`C&o@~7OTf~1w%v8n_HzHZu%ng_PQyJ|r8#|VJ zA2l;-o!@zt@v`QR_G;(zW*vBsRM4UrWcIptpf%3v0&Nr|5l;~>5x<*$qTXEp z9tSAP@nV-V(bBjir~j0`eRJ71$(jpB!+ekU@2nB2u=ax;@O4rDULyWv3R7zXUN!(k z53CW3qDNTRi5E*;$8{K)2m=byDoYNw|LWC9Sg0FYDthBa2KvA$3jXWHBg}?OZf1Ww zVoeVyP&SeQw>s(Q!&jvUSIqcTLnDyY^WJ5x{0Tg1OWNPpe!#$zY~E0?cv58bNA>#whM z^lUxC7BzN^3Z7(MO9+%dPk0jDcWJt_864QG5k$b@Jy;9J0NWuN_hKWRk}A5~Qr2Uj zC!dJsQ^b$VfrTN^1FL@_d{^c-FO&*4Kxj|Z^km}?mOS8}mXXd3`%QG{gP|6g(G||* zfk?PGmVZV3Yu(5eIEwFV@5x5WyC5JeS|{tj1FyWB{Zu@B%iT4bLb3vUpesD^6ly$q z#3ddx2P*z@`R8KCAh^EB@a+K5VcrtCR8S>7qcBfa-%J*-grWE^)H1GHR0qTrl$sP# zXZ}XTX%Xj8fJy{oa`@`M5GvWi8v#U(Is#>2R~cH zcB#q^xG~`gC)nLxw*M3O>Q5V?R2uEJ9GGXD=36>|YpwcLA-z^jd?L1~IC{-e>5}b{ zg+4JnyT{?p9PkYaG%Q@IL^Wd6`U?XDVO~R8e9Ko#>1qU9_UvHkpP)y61V<8#9aQ(# zCSs}1QPM9v>)76ZKwTg^OR5SkeX*A#9M|Jmr4l``Y47F__QvqNu8UBrK+8GEfy)O6 z+q*4;lTApAF(4d2(S>q@R5@ft#MxFsmA(mgduB@(>wC+h~M zvv~F3UQwh3?-y?E++^A5{kA(SP60?pDap*UH*PpnS&yB>`N8x|2;xDKTe#AvU9jAKpNScffK3MPbF#7MuGg@^7$Y zU+s9KDNHwrSTS5D3~OgEr}+`_omsOdGn0)P=;DK)@C6I{5L|Fo?kS@{3)2}%bll>z zPHcE8de6VtoGM%dPXa3)RssmHr!e_!aF}CUB}L3 z8i@@E0qMA0@u#X`e2?f5f@~0oI)fDxZKXskNp6m?(m^_&JiK}=29Sn|a95UKxrN`L zl#?%C(eJd@eh_AQg&n5Ye1w^x`()F>8#;Q%h_BSN5)M>Jeh+X}#FU1!rz-JCPMX7? 
ziNF13mh%qe>g`JJ>-^e#w*l}bn_#S?EgR>HSqA>aXk~^3*KpzdoMqp!XvXO5$BJf zTR;-#9#1;`YFHcCd&W z&SyCljj=FCz?jm?h5UZagggTKuNb+=uLdykh8(e23%P`zYc{O0{!@`k%Mmyd$x1!> zeGLaYI* z9t*2!ZfxMPW8c#D@e^5OZRa7EaJ*Zq)ocMhOw&FC&WH6Q2*h?A$#()xWyI>dR$=JG zb=JajGM8|~Fh(BgDa+ubF&;Q2t+9O#|5Z(z)4$n*(je~y3sl*t|7Kl;Rt0}1Sqv8M ziiWd+p}uThR(bk6Gc{(@ur3btkhK79eg=-qi7jl+!wsgbOdQCvu%Fmubq$Opgd@Kq zgmB;a1`4A#?=1<9&QfLUNd5>V&}HC0MPUs__#HNB{Mmf98fZmC1zte9rd$WHJl;~H z{l)5jVSN;;(ry5Vf}xW~7GD(HSBuRZnE2*iKfNdzUckuh5>`Doh3HmT=i||G~L_ zweEJCs+14S?yaeHyZM?C79r^2EO#I#YRHIUEFFthQFy+ll z>(9&*U7)_T`m>Ev*r;I?W+H0)m}yFTM7C1lz>_sBEGE7qbZacqo zcQA%MsolvWVMz_3hq|1N_Np;&xq%WWnjj$gH}xsa2ER%KoJR|C8#lk8vj!rdwT$%H z1hC4*i|osz^_$4h#sBPp_#!2f{gcA!NWx$7KGA|}NK!YhZ+uOD&!6~*cEdI@_QyQL-qi2yWEW&Wraa>6s}w3ZCC+*u?Y03gZrOOmaYg{TaD zO;#1y@fSuvKLQ{P?!F)ckml_r;#+-0cF0zKub%0LlY~Wswp(~ppA-zmt z?^)TgG~)Qe@4qAK5$(F<(O10FfmHmTJ_qNH!|F6}2fRRu(>_w!>NgM2Xl=ZYJoFVx zd&DvS+)TkI%>NPR~$K$FvEf&kk^5O&B|)&}J;5PdV84HG)%u zP?~b(3Z=O>^A`CiA?Ctz*8GXv&y+p6=n2&mN<;G(eDG{XJz-HAFy6Z<+D7E1m09Qf z(I^G+WXoS&kjUqVUl9`r)$F_sr#kk)_)K+h^jPj>sqC=!;c|eg+sWfed{wbo8%e;( zjY*|iT23$ETC-9~2D#Y71I^Son_r|`Rd{~qT;~*CVU2LEGb^dV_4b;*4dWk3P|i>( z6@EL2FhP*X_rJnCZVbhKGUnW892bn>LkNDfK{ntRpL3ZKv&DZ zBb!Dso6~3ki_WTF2#GVP_{ROK9Wu$N?z3J&5fDoZvhpm125Sgyklc78-dtA^7H zuVSfrm)&u_1+%$_<#S4#Hw&|i901Ax$d**DD6D)vnu@T_ir5V+*O(U{xv~T`v+2Fk zB5)XpsSv-Ysin6%>}}KfCfwpPu4_jY`5N(OL>KW3)bk~KC7ROCrH|mrR5(+L<0jn8 zheXh5)YtcjYsBB+@NWQ_?`KHX zf5Ir_jB7H8S1yn&{t_8JQN1Y5%P4fcTCFlU04lbY1kkJLYHn7{Ix@&-gF~;aAwoID zY+~kjVtBp;+G-WIuFPY9FhxuUh&yNYe?^@84}jZX8SDXt^USom4dcCW!9Al8mIm4< zLn3=v#QpmIMZG^qbP<2>8pa2d5d*x=Mh#~+{hj(9)I3$;*d&(Qs9`LN_XT=n!h1LN zwvgFM!AT#83RQvYtN^=JD(2M9=GXV4qqx9{jSTV|DV;X%e@_OYKqXySVH0Ss#Mb2S zt2o)(5`-b)5mfhuu_{Rbp7B`{0i1!}7Z{YDN?x-a04~<0+v25r>`7;1&gy`LMWj6f zHr?0~mfj&4iZzW{kYPRBB&8B(Ciqa<8y7YzFsh|rtX{UaiN=M|z&VFD*xa!iDfA~E zxMWvH{M1W!7ZOs;M&|a$>lFBgHwe_6g3?Cs?TD*bsFv!+VA1`{72)c@1O-Xjl(jqn z(fk5`{hKsjJ07w&o~ptJ_txN+um7^!M=N(+;I9fiehmxuxb6f#AjH^XB(s%lm))B9 
zl?@r5=+@q3TZIi`Y3yS8_vL6sq`boj?d&j1UAA9~7zTU(8#(R8vYE;fnrnstmxp|1 z@Z)SlK=x2*wZ-#|^>3+F?YaIZs_C;$BZ;;55`u2{~v_W7CSF35;w+9Cd~g| zpTR~cD02-n-Nxv>F>^pQ#qTo;c5e)Qj);f`*ZCcPkRZ6{h`&dC;pp59{#`f&v2KFa z8qB6KmR9ZwK(7!vY!)#W1@CLbFDzPX^LKkg4W;#JCl?Q_UlIh#DMl6|Y{X@L{}Y{v zY6==6ep>!`0Q+9n?=z4OcmOy(ar(oCX(`PvH@_*%7!A11SlERmY9sy9&Cm8GyAmyq z+G__O=F^&v5!G{r&Kmhg*}>_-vY2BFn*aav z^Yz$NxE-=o&SSfzWo?o>-Z=nOv=7n0a zwyPzFi}TxArcm&)2RQgQo)xv@Jyqg=_SOo~Q3tQ!qBiz3$2AF|0>G7)_KrJ}crVBU zUlSf>GM@O}LX4;$mvDJ2)$P@DW=GQ~H1=$W5-`0>9!OmKHR#K_rDRyCRd31O;uuNv zcqR7Dlw{3-)RxB7w}`iO6@3CeHHN+RHjA}3)44c-?>?wQuyQsEphy4HL8FNeU8GR6 zgyJb8sKZ^+HAx*13~PBxQm`wyUSL@uL>ji<070ezaJ8q`sfebIs1A;1CWwB6Cc0VQ zjQS3^X!S!6HY0bw^ErUr`jHFHS7j_v5KqlNh_LPJa(gqH|CL04L3nLqwYIh~pPF&H6$9^tqD6OePr6MbB z#B9%6X2MX-Ea9!C@K;#O4UdJ3Q74!E=a(0(!4Wb^m4BfI`L@~}s3@wEiD_s2PHlkZ%wlFVfXcjk z_=~Ui(18Qob9z<5nvMK@4IdjG;!s%mXWq>Jju;|-VlnU8z*I(dB84xY zP)0nyY;D9nr3h}ar(%xAB|RDMd9ohbl*3dLM{QMm*-&Yt?oqfv2#DZq8D-RzTRdXf+uZd(x<{&@GjUwfm1usAJEg;@RU{3 zPv&+yR5NOBs7>#a+Mn#F1u;n&Dqc*5g@y6oGJw$_EmGwTbr@e*v#cMLuQt=h<5kH8G^Yvd4GNoDET)zDVz?e_< zE0xKbC%#INb4y|hFvkV|@z~(X+KNY>&8s3UKru@%x&>s%fM;7zNVvb*0r| zU)B}02FS0t-NQ6)FOGxB;Coy*mYTy{rtzey-84`>ZxSk;qzE^1hJ zfZQ^GU`_9B7KV_$J~|kMumI6gSO|nD|DVi4J1ZsMR!_8i0ZcuUhg4i=2P>GVBNgON z6JpZ6sqzBeFE4_h%;T20k@;`-5&uWT14g6rZS4_QMeIAPjt_RKH~dUkBV3Ar)(VNq zN*8)t&;Lz9SlB$dfJ`1S><6IKHH!d6()4zE8f5hw&TVaavV05>QGLNnU#ahgWk82y z{lwfng8k=fQ~!{P()|qHn%8#ohMEN?TnC*a3+TgSw&>j?Iu&Ihh_ETVC zdwhuK*@iC4P1bvym^R)^17rRH>TbTNdim0D3kuZ;B?iFrZ(P@ zIPnK#P0~vc4gWpj1b}S#>Jf_?*YGtAVg#F9Li;GNNPMGC;D!Gf8A7r(He)hr&yGB( zF@N*tVw|DX3j*+0GsG>el{0}qtgl~RG#Tx5v7EBUF4bAGIDbmN<};(y7F^L}EN~=! 
zNTNTkz$ICMwnp7yb&|fM89>b%mN?4N;>(c(cuZIn0o*U9e~=hU9M+lPfRF13f+$K< z%S33FK7BDgU_ON}VJ_ZjWSQ{cPgau+yjqhS;10i6Q7^kNJhy~~mM$tyA>|oVC@H}X z7n`srOqCjtem|@ZS6HCzxvqgUH>1L+ST@=11TVJ3FdJrxr2984^F8MfRbTZQN(RB* zbM8MlaR)10wg8_k4}@9R@Rhoad2QsCVt(bIT$6G8 z@R4$f80HGkaWHvhNuAI|VSg^ZbnjcsQPf``@zyn|Aj|%1Yt;&VqyYV03|kh7@#_3d z_^B)T(tX4i{!+!XKVYG66sl^FN=>|y@_|wGK(E`6*S-cv>m-~7J7L03Dd7rJ>n#kZ z;ONyx>0j{*fdhF@Xzap3)nHDq<=*xH{~oWDDdLTo_@2!j@dpo?o0Zjnxft47jY+kG;d!)Ydl01-M!?70c+K>v=xa7iWni&<%U=3@%&0LW) z8_5g;k=gC{#li1wsEOC)N-nUse!kQIRIv#B>0?9#KDoyqPV4Iz4LGr2YGFFr@TI{# zRCzl^q6A5pv;?Gz3NLim+$7#(_#a-^pWkOF=?Oa`G1| zvEMeMUydBRes##J)JU~q|AJNAPzGCc@(Y}vIf6ZVq1ul6STY5) z9bG>iTcxg_6F;u9d>gq}kHA3~AWVnQ8Y8|J1k+}UWPOc|mg>n+%)Fxpc6i#iSQy2;hXU^|v zzM--6KcDNn^&p$l@Yx7lxHGUkS$NG*XkndON?bcq!4s+cj*z~03a21qePMo<$_N!% z%La<$Y_f*is|NA)$PTX-E3SqiG76aBg#0k2NWrZZ-1Y@G^S++NG}1?N2Ed-MqCE-# zZQF4@6iq?vMAynh614?VmgebhF<{XWwzu@oS>_LFR}b7zCjVF6pft$3v8PNl5r09K zaYzv8fIQ;B8S((21yu5#z6xty6}_agZfEl6S=jakdiHh+`qtS1Plm%JO1hx*xMZ_f zMW%rtpNQQDs}1D6XF-(Ou-CmYmU{y5)@Fk(Zhp%dDB!T&XrzpnC}wdLm36+U8eEuW zL)yIf?D{73 zGAfO&*({YCZ$!U)leR2W7?N;JCd3kHtKQDepG6M|S;X_a zM+Ca*0gcbj8UzbtEBM ztF|qUK!yDu0P%BtIGdv90@L#0>Azc1!_YvbtPPg`&RHDRZOFmxHFRQ2JufeMVnXHB zIIhKRx68(%15b&b7>s$uI}{0p>ngO0g()q7Nkk!W#wr7l4an(x3B6`juXb7&OIMme zTI|!sJlQ)r{s0bM$ha!j>Vge=LUFYxie=!R?xZ#lQ-In1DytX%`2fM) zTLydP`Qs0!*%`rinK0a$Y@@N|*po*DcbB>U7F~gux5+9lH9oK;A@2Mb&hYPry2YDq z#4+NJh!gW)#;oOO@y~d?7Q`T~zOaWmWL_fdMm}>i70~>dGf41cg`GcNJ)1OJ)d8Nu z7h?XF6Q#!V`p>w?u*^qD5eD}n1C?xb*^tA9qli>(Q zI>H-AqtVkdFmY!pcJ=tZ!SeSGtiR}msh|MVxiR3<1S{_duK6r}V68@bjGjv>J0mIc z6+C6Afr%70Kwf-glc)&v^&jl-VB?I1niY-nNx(>bWP>x&UJRDAH73nSw@tuz1bda{ zdIU@QHT<5tw12SM%2)OT;Q0d^Bh@meXJZfB811PYs$)rok{)1f-qBb<>7NF?y2!`W zhs(>A!0#7?JR)T(v&(!9Lz&W*NL)dNcbI0Co#k7A5;|(fm08{iTbL2jk985SqsBSW zPFMqf-y^QvW;vN2(}4297}yukd{#AB&0CZ=TG6DJq1#!@B`_%&bp z9#QaQlONVty?B)*UP{omxUmzUYa}X8$t4Iwj2K1%bSP;el5#I2q>_qBI)K97+Ho$f 
zmguC5-(b0;^`Yo1O7_1+UAXWcgZ;>bHQ&JBxDd-_ez#YJjOMm^wS;HtV@9#pTp=wc9_&h#G6fuERFdY*^V7hEMLS^3S81YP%wvJyOba;4xoyN z6I(E1m?NG*OOPu$p<&EFHZ)e)A-g0+S3%))Y*R&f=rX%f;##KKDR`ojp} z(1B_izGb0eQ30an4)3tmnF#yfP!&+4`66(b;H!s;KiK4@iHSJ*3^1(lI+*TP)!+>j zQrd{58bV6Ycu@hH&H1>cS9@h&nJCL~uvc(Nn0B1qQ4o~vvO1=caR_4Gq zm|JI1Ht&G6)@tKb$yWlX9}&M=ogn=8m3EdSY=iC1)gIIuNhx~bNjF#GqmWu-p`)?U zEO{B3qyimN)ejhh+U~B*Qks$vkEB*5MevI4K4A3soba@XVZ)3^v3(;jSG`y*JDPc^ z0C2{8)l{uVoQ9Z=$*Q!AqZ8C>O49;eIeOy5yTpMei+80`{=l8=O8SQzLj|j9JqNiR!cGjEzr6nz?`9NdTX!;WCakc2N zWQ_Tl>k}&U~ccv8_sX$b_(H3_`eODO9E}H?2l>9aT z!ZV9Mvr7*gk9yrnT~LSIafKJcZEL)7V_RD!0eHaU&05nZV=D^t+fz0MdjX&ZX5p!eMKL$qh!ZE@F^6md^yx(eU!tT0OhZI)OVXYg-Q4jvw#1;Mp{#_Fk-9 zZ8h=&7}StB70a6PF%icvs{G~@jz6HPWoRn6=EjymGVCZ{sQ%Cz?a_%+A1G!P?RVnG4*_)QIJ646^= z+u}vWtOF}8N4hr!^-LPRz-8Uxo)!+gG@h$8y|nsbIve@CHBxSI20Gl*hMti(T!P}> zU=VZR16wY2!qBg%e;aOInL)EQAwdN74t%GiQyPDDMc$akAcZxhx?8}W_{VX1wihq4 z#2rG6c?o_*JR1x1!b7#EC*4Kz%nUQ|mf|^=G%U*1-ybB(?hDS6Sudl{Aj;fXE$F9K0}8e^s%%=TxlrqeO*%keZ*_Tcai}0&(#oNlFoIO>7_O_&r&+j`5^>&A8|(=v&yFMVn1u>xE2cQ zOyh!wo+A5sRRDtq#8-Z*Fr&#A!GbVY8)vX5tvFb1X2>ESAu#Qqy6EHsD|Wif-ok)+ z$q;&~!Q$dJSq<=Htzr!!vH|*(7I!30$lUyitgQp`R>1EX1N}msCOtoMcZxi-7eQqm z@mUeIpznNRRl^gxV{Oe1A4aV#R_<)iL*4L2AN>Brx!9Vt?DpgPh@W8p)pHiO)QYV4 zJ|bYn)6FXk-*r@auZEpl{MN~$L1CQ z2Akk`AsSCMl;pq@;x5T|NJVqyQ3fUDr?5tt4rfrWQ(%n(kmyQ|N+GFat1}K%J3g^R zqML&h=wwhdEuoL(ya)T6$rwlF$P){`u%4P-I@xxq?jbV%WG%tzYc7p`wbAVtlhbUt zoPnE~9KlG!(BsVJ-xTRzLTy|kzVkqtQJ=q9n$%knNikfFOy_JHbC87Wjp~?<@r8BO zo@F@f9(1MdSrwq>s!Bt7>192VUKO|_U$4IDGc<5ZJa0ei!w~js&C87@I4VphTCQ=J zlE~VCpY-YKF;g`<1L7NF&PJo)8luM>OrEn(MHK~c#fLQHkv-{qi-)Zf4AfHq+gPWA zv1y)-4 zum+*nU}>8eUx~g>_}vVG9W76l7*>>ggD1SM$4QI*9v~3UP_$1rV5p?l{UhRs`;ZCY z{4*YF!!3-aXWIkr&RGgYwZmqaCvPYWyyKEK>kO8B|K-I zIB_uv$=rwX72(18GaW%?THP?(qo)6B$5G9kel5nhEODrPqGIua3U;Q5sgCIiHze}( z&Q#T1_WT8TL}hBN4UYAlGSXa=-mUZKo(Al#^?#O@nVs$AZ?isJ{7cJLOB1g=X8aF2 zj*>G|5J~P>NTI)B9m56SM)QP6gC1eN{YUnRToZ?H*5Gz5QqRr>4-7d){9m61@S5Cv zuzT^^Lb(hBi^D(ihl_F96&d_1yyTaN=d~6|3IvV3kjzEI#Rw;`NdM9rK*JfXaf1Pw 
zA3)r>1gjftr27x4ZbzrtTghd z;^?Tqe6f3l8OmhIM$KnNvhWna7M67kSo{E{x`(}Js5ClG zca%XdDqXgOn?0NNJt88$*(0(t0Ze7cmb$?eU$6x|l;Gslf0YLMj^u0iEX~f+$E}U1 zU3^|8dnIJEpW0J6o)f1oLC>{+hsfO#H!LM&ZAsxD>mcemd&7kmXh3_S@pef`-Ii_u zraZ9MCj^YTANk-adxlj93+vtu?6JhKW_|9n<0v`MBg3dfrnSaS44k2_JTBs9fE9=S z)kui$hUq?gZAcS#1D(fKAF_jH6D{+{{gC?qw2Wf zz7kD${^iVBVQJbG9DYJ6l(un(#U~YGR8^_KO$EF7o^4}2;&dw9eF6Wc4xG~TM(Il` zvX*^5KpSnSV4wKk5BFgOGT;p-$E8vQ{K=M`+VFd~>%Ra13$n=;QCMs93`-Mqq(`8l z+%+4|lQHIdO9F+J7gi{$?Hcht;vDfc;v2Rw0zfKIdb$4b4J%gx;2P*$H9>i=P{w{f zB7Vbj4p`w9+Vek2s0GZJ2?oNNxNQn5=meH3S1k#Emb?h&k9FyE2rsHod0~ieWU+`P1j6SWWAyx)iT{9s`vy>xw_+M}pf{8j+SHMVPE+Gvg z(jZ>=%?SWd6Y+Q8ng9Kn4;cZ7!OoZD#24dmKeD4}C&1Z`8`*Pdca{yk;;jb{pcRkP z+AL)|JBYZgA@C`2;Dw>b3rv2G=1Gd2Jq46D3xEm>Zu1B3@s&H%gPzyu!O0hOtsEkr zh~DDR8xn%MEE4Ew_}p@wOQ55gK&o{CQ|sUJ1Rnt^p z1qFaFl-jT6Or^;_BW2Cl(1hRVt&vAaVK53K80)wIaTTOp5A)S58A}lq6$`QW{gJq= z3@va#FHrB>h@Y(~C-p?7ZHrjA5=uXSC083+lAfN1Tv+PWtL5b#eEj{o7p(*`4s3O4 zUAZ#)IW5^`PftNBK$rkm)Qni^emzZ52=pHN(FHeX|3C1@JA1fLx_-)IwzMNXN4(+2 zO5+7HDyo64mK^MgT~w&M<|TNe;Bec*(b5O0z$WZ`Yi8TABB>9p$l~1umti@>VM?JfvN#(P&s>3@NyC8l7p0npMXiDQDEBqIW0VpRu$RbKk?9k%D!G`MyzO=odzIor?nJ(FfO7e zNNpm1UnA%qVq=5FpJ*Na-w{txW5bf^OxM3QlrYj#USrhqB+n85veA`)Ufx_Ozqb)q z1sAWvT8GH+oPbz)kFA;KWb#B^%XU2oMyJ05+{jnonSN~`LiWih1mzB|bQlUB6xKwt z2f965!R)qpD*0~=;;D=%*I9j*{tT1|8Y1q-pxb0zU=0RZtlt3GZ%}^Ch3{X0u~!R8 zl?Ski$DH?&7(pbSjaMq9+MXVqNhM1*sWNw@&YCTz`LCCb|%NAn$76@h~k>v+P2dV&4n2Q{Nq6P0!_)8tYeC2l$U#%cl@c9h^yGrX1gLEG-M*N^#N4%l_6H1-3AFizja4=z_1UeH#S$-rSvgc}dgrUhI>4#A> z*U$PGMmjkk_{Ir7c_J_?C(C3|7Yf`8R9CAfY^(wFn$VyVefYmJ?Pb2PnX!MhC1^j> zu!@E)4JK_cp4Jp;J*RpRzy_nVdKRzXp9;>;>}P+A_`@@Z$rOLOyqeQLSOf6RHZ^_q zNQY1x?IShs#(+oiBh#hv)+2ah)WX4yUvXa`*4)J0~ zNkqz;0(e+4YT$PL?mS#)Ra)c0#l}=^vd;vX*n)%QPO6x7GzE&(z1i8H2k;c}uZVwI z>*Zijw8w##CK$clea+%LN3=MNgTd+t9@Zq1e)0owp}BBt(-gd~OTBReKbgMDCE+BP zrvm>3({xQvZ}5&ij|>)LE!yj3Q@x7B0Xs1y2U@TJqqzh%eCA(s67UGgaYg`I0!ZEo zfJ1WqDje0zJZ?|bao~zh6v3~F0~)M#b~F}dBrVX}F6%~?!k+tA;Kq8$tGO(w%XIg6 
zd(Eiz2i22V@v9DmAJ*BBQ6U8fW#Taa6@7JTL6YSz=79JTNIR|g(p&fP6>PMN_zT~7 zv*ZEIRtssaqZP#M&c@l?Vqjre&l_7%fh*(VQ~T{v3<~uR7T9Ird!uF-8CU>! zTI*tLv6&Y*?Coa&q%eN@E#g;z_Xo++Xivt;XJ9I$GOh&-@`GQ~ zaFYy(*AfJl-`Ge8o#i7L)B?cP?s^{B^f`^v0;8&>*AT>TXI?I)t;~%KmWT#uBK6_YFO~LPV!~e`a_)_oUl@So87hOE z?b)$42z0joeU<|)p=!S204__twb~CZ$toxS2}(57o&&GakW)x-mzH5FJN=TfONCIA zImg<2Y6i20 z$5wr-Kao&hh&Ts8f>`;R11ydFQp09z&lK9=Iy;8GrYfSDcRyR<^nwXK06>oHYmvSA z&WfPD9a7atTTn?Vgi+n@D<%{i-V3HJQPy2Dj#>LxM!c~c9T7{czC_lW?D-9@w&4S{ z4Mq@`RRTs+09BjkQz|BNxPp-9<9u9Wu#((uXH@wWzM-T@S|!GR>jOAM{59f#;N^~{ zg4n~2OJlL<<$H5Dgj9wJ_ogMNteT#FShaT*9THMa*mY#?=7K^H2mVeZ zyTec~OOEYo1Lto)F5bDbO4+;%7hAc;a*YgvFPK#>e_om}?Zjy66hR4o7vPJD+n)dL zd&EzyZihd-1%L_8;b?4)LjTU1VPn~A5bGsS$$GSq)Z^aIRw7L+xLYVhX$@PJK=q6r zA@P#Pkfm|hEV&K+VLs9t>oKgaUq0-J{dOp>tEF&4dcV{G&7b)r;swFlb=lL=WD7t6 zZj_2mCARsdzQGHe@>3MU~7Y>qv{xg|#09Crp99hR7;ylxvgt&ze)M4!NVv zLG5TRP@;KT|3wjaPtJU_$vas9EQa0LP+IA6DnTtKwF=BujaynDAd3Rw(2R=bnPF%%>Q+#g{wjw)F zw88IAs5YeHlQMsP_r(_fmO)HZM+=(gj)=nKcV@r)oaN911p5WHreL)%u>Tud|313} zvsE4lM$%w+hB)UdPhHT6GV=i|9=YeMjYj%|l19dP45poHwu}!D|4}ci(S>z&9?2q$ zbroMM%sJxMh<(J*COtf}V-=>J_d$hC4LE|`rQlUf@{y52rT>17z`2k3BjO?A3&^ym zh}SG99xkqbqPp1-_fN)%9U%fT6=c0G5x*ivd@y03I$|1|*OL))DbREfK01ac>rNZi zQuuoLOjIRdZ`AQ>2%7U+f}`KRP`ax4zc0j=iG*Zt9li_iRT$%b1#*@ax@P#^;21Ya z0MPq-QkEVjIq)HOc(@FIG~mPTtd8EYq5_d&b-+Rwzh#nKNh-6&@9mJ(ewYkYeE6T{ zF&E)qsK8!~Z7FOaS{ec}*RNH)TBlBLC;xzre5{2yg&p1lnyrGkS(&n-2ZY}akMs^( z`d~f`zz)!PXKSQv))y~rPH!f>Uok@B^%f*TW!ZJj@qDAO@l1z)neD`%kLT|ZKQPa& zH8Qp~QL9B@c|#v{@=Zz`zB2dd2x<9X-QaB!IusesY~9FV@Mu2J@Vfw!6XM7-mboX- zS82R5o#l3oU<<}PHk6$1tX?%3eBLhet=s3d2a_>EGRZ%1#$VQxnDi9@mq2L01y3@N z(6Y0cntLMfDeHj5*j))XjXlai0YKW;n+-~aIAOGt!MvW~IU7a;iPTwGZ^tvIcOd;B z+=N(gCNP<7LY+e*uOy*5qlZn)Ff~bt1_FqM`au%bvsh#D<0>~O+1#2sN*gR9 zAlBDE(!;6>Y)w^Xr2N}LDl9JEk7a8icF)pQ)_|G;FcZjV%jF$c#adm?{<93OhlSm7 z!(z-6QmITF-~fW3{r=BOl6K;b)ETkJgC?@e(gqGUd|+$hgc2=aoC*ZsE zw5re*Yb{>^#BQN?3eIc^25xM&Li^b`p~0~K9^#^CwJ5c!1pTi*@G4z_3;DA}|K 
zBZF4k$BZ%W@hYzv^L{;$bfWO2xHEzMPn^nGlU`6$K4%f=5hddmCN<1sRW{fuQv|H= zyAv_^YK4u8yW3b1c5(k75zp9~+Wt!X-VS0`I_^vvW2O)*U_fiqP1RN!YZXneU(vTV z>1M^0*bC3Qs@L|T!{u8zE+I`#mUy>TafFr63Jb$J2q(z98gI8yQFZi)+;RVBOW7on zY-!PdSqH$d(=RScDeGYKj`Jdjk~@_2R|$m{xHm1?#jpQjQZX?U!%}UwBmMeqO`%TK zi#6e&-VwBy9Nv5ZA^2yBt|fbGV8?1xp$(kwh5bk@(u1CQ_GQF!$A~vd+3DiVhy31x zogTbOuoL&Tqq#Lxjb zqyM|5AB$xa_C1*cDD6QRax%O1CrdobywKkx{ugvmi9b|=RIelO!L${Fc^OsEHkiSp zg67MhP(H8qg=|j}jE3du5(KHB)1q1UpkBHo{rRT}uu}Lor(j2-uFEBbOs;sp<7WdM zJ1pbgGz($wtu;y9;?Bj@N@+c^90i8;BVwXWsL3kXWXY;S&!^`A8UG&f1_Dbn2uU5( zPBx(eae{3yeexm&-atOgss0D=lgEDJi91rl-hTW{201U-)kZTtqZ$`;Gi8luF7y9H z^sg+_DlNHqH6OAN5sWwu9jqP)I7@Fkkok8Mu9M&YKH@7?%L{cUtQ~9!g2K>bkJ6a9VAV;k**=V!sQovWbI)w?c}1S- zvgn9yV`2vPj0(P51Mt7Y0&LdRANcu8zfwiIJ z+e_v19snSqAWzy+qCBt9fJ|Wit8gZ^xaZuCxh5rg7^k6Eq+e=|l zR@lnRBtV|# z%b8z4E5H<1;!!bb$Kic5HTxEaxzO=z^MqbIi-aY{r`vh+C7h@gJb|w&OF{f2Jp~5?>m#h-gC58Mc^d{86SB8mh-lH*cQ;dQNg3^*m8k5f;KO%lN<<%LFy|cRr3lF!O zSA45DRWji#59-*UG+ES8Bm+G|52)>Du6MS^=XPvFCIdTTuoqe;_V}g}gj8_^(?+DL zY4%i8t(fvUT5_N9UIjutVHnXF!9nU+!um5GI+0_)*vOSx8{PtPwkAfu0kDZ(d9p_y z%j1dNZcSE^s%Uy{J}|YJiGKz0J;H)U_WaHe;|-qFNNBjorlcG$TkBZM6C$B5q|ULyXFhpNkY(%m4{v$uW3G|Gh>3l*u2E zn+5)KV@2G`B$=ra>0-~T+{2`Y@lma*GZq^-Z}+be|76tjy)i$%p~>pAVOT+#>=oui z<*SA^N)Z{U5M(l9Mirn7y2pl`=$%gJF8fe>?Yofr1Zn)cJ=0gDLPiDD1A_JE15sOP zcL%pyBp?3ZcfNxCdP8;-!MHG>uyA0iAS#7FV7(m-`7SB8+p9mncZ}N36SLYqm)SBcWii9OgJdDwMX6 zUE)K+)OI@mrJcFKl$g&Ee^AU=yb6#&O`0YaWzP|qK%WoE5=a8|2!^O^it`rZ831{O zvmi=eE0$+KwJ&j0N-y_~lrFBmtS#tNPQVt1%R#v2f%*djPt(f01Acam<%k9fWsi@0!L59Cf46Rceb<|iE6<+D+aBUhzL z(qMT^!8uiepPJx6Zhz%0$n}QVY`Y|@ZZ9=n+*#tg=OLH9WgvMbT6ptl30BEOU=?03 zMi?(XT$*Q4^`qdYq`_`ULfYgKDnuMx^OaW)%m9o$e`Y*#!wpXbsqjTxmg5rfPoVVT z)i5ww_^ia&ZP?BU_`Ttl7oXo*@6wSc*|u~5Y#0Ne{+#^cze8C4VsvQ9$yJf2T%)go zG^v6DJacAbi1L}K{RqGv>^`@6yJ8)&{mkCJqcXp>AsD>D#MgMmOHzahjB8^Lkr88G zVR0j8_?ecvTL5TLjLzQU$|{YRj(=%JMKB}aKS)9eEa1VFbPq7ij2$?-C6x3h=t#zZ#Z&M+LLMQMVku0R#D1 z&!cNN)>jHRJ6P$`aPSFp&T!C5VhdZmgH3@Jf3r*aFbd1g3gd4oGQblNqsJMg-d~(S 
zi6Tg@cWXF0a0dD7S1G(PlCB2$%Jm*8W=;C!8S?6c51NSEJ>szq+R79_Ol-kdCXih1 z@Cy&N5KLdQkF=bunDT%F7cX=rygx@&iHx=|3zt-C&$$T8&o}&KhQiXR8t_e(HIUa- z1^+g>PqjSJjUmxU_@|M)@`@U!1890+o+uJ60H~H=%MJ}owblivI6^Vp z(${)@8}f2N@;aIbxuKB0kb{)mgCaxE)^$s1!Id;Iqh>DnN*&{g3wh_d%WNw^VY&ST z@UN&T3w!N}z5hr8q89gP&mU6%$BYagdCxl>)uITv!IDbTnhutHO@Nrq`V1Gy0Jmsa z)Vcn0%N(WFVMkBs>!iKrGyT2^Kx> zoswtTs0e>%!8^oHTG1AKcvW)x6RXGC^=#38$Y1=;fK*40a)Kb&nCQ%1+EOF9EKw(QtD;Zxl9HgO<|J{xZM zjYZnq{NkEc{xs2W^4yJ8XGxApSzD`T_x}IxYRx2M;HakV=*L{KZ$mkUa@;*jb*x zx96cfsqvI%Kb`R}K?WxN_X9ANs_1&WbivqRmvU@uJIc>);mpG@ydAW+Z`o4MO9Oas zd8MnoC#>zt{#U@rOa?LlSNGg!Z>soLj8g3mi&ulbc0h-qdU?Yet~{}L_$&Y_chWar zfXg)!u#ZFpG2wH*GkHrrAti|Wh>~W;$x~|iQ!uHf9QKb<%C7l)@>kLGF2s=X)v#tCd zamA%x?6fZ&U}gHO(gOe0y3zNX&0Us{8_6yvwm{N=-k{_@;;#|^NZi=`^*=@Yoo3vV zfzS$1T7y!%brR)usD?Yd1OAK@RR10EZ=TS&SpSLiM>(&s*Maw(LGu|I+=O%8VUFr+ z8Vvy#_SCIZh$eD%w6s97u?wE$yao_K3>+DGvNLBm@PRJNVHei0wza_*XZBpZ8(R|^ zq(%JR`mYMiaf{QJc%#dzPznzH7b>l1@JI#i)7w*q7R{s+lV|>5oYdYFacaA|A}9Hh zeXjbzy|osxX7_R;n~=sP*4H1_{n}ti4R~OS=a})(q7hX9=86dxoV*HKx>6P%2!IO! 
z$w0^DC`=uoAolFcJWHs)+Zsr9{nh42x8@s4swt9Yj{ zZDEaByIM@8xPg+=hYk3?9pI}?zblr1DK1QP$TGS93Ft9e)32~w&#YK|^_0DS)q8sa z(8hG9k_pv-z{LSp4{k$GNWs1;gP5#5vhYELIr_rpxp~)#Xx5J`eJfg9_WOs{`hZj9bB;x}t6ZujfO z8{iFeeq@_)wr23-lA|d>OtVq`nn1_J2DVnId$qiD!x>bfQ25uB*w@~i>DlVq zJMLI~TIvZa!rz5;llR{uk@vq}yktEM0Zh8hk-wM6->!Yw!1FL4EK@m|c;o~@GG-m9 zs9QB@DQgmf8OXxoGaC0J;y0e0HUgY_9>V&Bk;J-{-0L<1xCJ*dZ{!!wzF@ zN-_wAv;(f=nG7t=;HKq`6kJGWSGM|_A*n5n;DC27)+#>HERU9isP{y|g*xKF zxcCSU0FnUB_xH}jx9?30Q4jzw$pW^P>K<9H4?N!1p0zn3*16e$ikLNqUb6a{Kh#=$ z8?xcIC)^a)!Xz2vfb}=*^I`+e5#J;J;DNG}F=;z+)P+qxT5)>ez`CVGP3gxSqN<7e z%6R{hCu%(s!?x_|mUwi}0bWobg+-Z-%_0^PI#7O_&l67 zNDd~OjwR;w%%Fe;2ri(I8V=7~uBfbBmxQ$tdq?kK}a zvQXNYFDwa+DKn#7kx9tqr{D@QU(FE$B%RKBI#em}c7B;FYUYvnT6o z%*2p^;>3>Db(Pcx(+>v!H&tOuhJ0@nR4hvh_uOf&5v%|#Ug^aUb`%+<+ejC%vYLHZ zKzg77RgkH-isKEzO%~=Q-Y6I*HUgr0gRiLrk;>75M`h#1go8fT(;Np%t49-UZTa5b zVyYOi1q(Istp^seF;YvAudzAh83*(f@&AeV%5kl@+BX7QOM;tHFe;gawS9@b-$bHGBOk8l9|f@dIM_z+wI+;#(r4tf2?qu}aZVJ1cQl z-0bKGwjem@SRQP;Zb7&iA?5mP4XIkP9hDuaok_EXi0Ab@zlnW5U@$6~PsS&YB-D!< zh$>qZvHpr=Kn;@@8j#OP?wz>7j?J%a7E*}@S+er92wI^++d~95rW{i}MV_p6A2?bg z=ie;Z@6o(mISwL)(_4@WsmVaxFG%~KR5b0Rp0y8uqZL8q^<+&fGaaiGBc-rfM0#h~ z#)`NnBo?e(Ci*Je&3Wt>YrhFj*FNIOW`*ADx8Y&1UrZ^Zs+Q3AC42IMWP3*>DEUjN zgFKa6ycPkVAVWHUG%v=D2HcdYv5ORN0;y@vNJBK(}))=7ej8gma&Yk+(nr=$e_X#JnK^?OQV7>k32#AnLRn}X%u?1S(|7hyB zvprQaEc8l-miCe<=uNoc9Z^ZT25At_*>T1*H#Nt~7v@;*s6NaBgPOm|jDk=dE^wL+ z*Zgh;*`rB&76^E5B>KX(t7H4#8WO@X+M5HK;QT(243zdPF}KmiXz+{=6$tAs&n|2% zh#+Fx%QoBpi88mYMUdvj)a3D54QarW1vfu(NAJYoiV|O@3^OA6mv%$UokYrE9PQ2i zD;&r~>8Hiubs~h)H%T+{#3+(8&aARJ>Juy9nj6n7enIu(!IWns3EgCa37bU8ThQ+R zXcW6-LtB8wyr~CTy)%t$6B_@yi9mhh>L3d9W$FIOPwwCng+2<~n)`_F$tY2$$@yo~d0q~qE0ad9V51(n;bX*iG&hrd;|BM< zh(XG{H`aWlc-z|pId*JyhJzkfiQxn?`vuf<;{G!TL?Bk}zP$+^3R6dAQt;q{rT^Np ze|H$u2@rg@K0OoMm;`|8NSW+`(j|X?Ndl-eU-#BEGLemh(NrTpus8tHnr?02@ZR#E z6Q%HxtIMqag`;yK+)4B+@3mpjNh*T)ph@UVJz#In@W^n^$8{1Y37b-(i6l%Xub|KM z6!DYwNTs(hn!8XjrlNaUF*p^{sfyJbx>Q2uMk{(Va>JSjRlRc(O#KLnCob=|-Yf;K7 
zY=n%C?g5h;*{4*BTo9C==w+%f6eUdZ2h!K3t0%Ll+5Hn}q_#=1r}gUj28was8)wtg z;dsXyrViLi+rj+NgfQ_D@rvpU<@)|xo_4my`U`5xCmZ#h5dw)Bt}ro4p%0vg51zVI zCI~Lfr;av+qi5#^>wcck@S+!OA}oBMqA0B=VM|D8E&61lw>Td;y(^2qW9Mc}`ASWF zvHpSI+$?jy+OjbTjM6%MX>-F0OLiK1R|ZQ6GcsbCK*JWif)~{d)0^3(y?hcOTu77G zp3qxj(^)Phkwu)_i}AS=gF1Equ)K_T0t4xsjSPQy>Z0)x5}C#2d5e767KFXgMcH52Se|H{4lHIwdTi zD!2tgrmg+vOT-_`r#xl)kb+v>BM(jJO|sFk$6^=v$@!Z*AYSQRnzwh>bDpjAHCR+% z*uP3V!uYfC&$QQ$RB*6%#8m+BJ2_ImY(PLG_;Aby!3i5aET4d~;eDkQN{{PiNZpXM z!&g%uDil3;_0qB!g zGMMeUi$=>{8p-uYHBLpmjrHhk5)NM~lr!%;Szh0f+KS^kWSe2tOzA$nveUif zUxEA+2yi6&Tp5S7sDW*5fMLVY4X{FuHuR4CtX#)XrPe2&SRgvAMvY(NPcrs1+EG&m zE@eV#GSUGjmmYzJ;51n$sc6HNl~#FWVYwA`&}qrimw5l4%rY+{RuOu`IV}k(b0u>A z*Dn!O#6NrqML~Uln6|k7S8L%r5FWSd3}E1d?J2k1lm4GT<0XHS=>rIp&<*gXf$_gn zHz};D66?>uM?B+sCR|-u6OfvNQWaAJ|GlH%WXsjobhfB4yGWlG6p-Xv@1`59h|eAS zJSHjzRcA*N>M2zph}Tz}*P1r~r%`3Ylilv{Sr?NZgTd+v>ICljyyx4CAFo-ojp1Es zpBw9<8IxWT$;eJDl9<@p`gX|srmTd)S5eY4)q&Iy5Z1$-r+`#_7+HeS8hMV~ze45W z0MD1y8ftTxH5qWu|Gvyv3i^8W?#|9xoqS9Q%E=0Ur9BFDGz^ilU}|+Mc@)vSesgzI z!EJ7=w>|Tc6%>S#H$rMG0L41vivyT1JZ>*VV&1xeRr{t4uoK@jeVY*U1K4 zsoHG_`I9Mw#VzkJ?^Fn*&~vhOj4c);_I3nB*V!N{CccC)c``|8gN2{h)>qnNt6J#7 z-V(XO<|t&&U`6_)BzYI{fP`m=xTm(gmq06;Ozzj+`>)1CQ35#M{1zdrJfN| z?MY2F?jx;%jFi8`WjxzAO6F2$vqnX)u-OG+<(i3v=86MZeE16gu;r{4hBO60x`>x# z5R!KO49glE`dLqRk6kJw&rlGttW0zUWlJk2Y$zpG$pKVq+F0Lgz+Xvter8!k)EkSju>Z+W-WLAno#hxX zcooW@Kg+~T4JT@#gUk-G1Paq@S8f22H+{xbHh-PGjV)lApBDbqU1WyY)Wr8GMnmi z0-{8?Ncy;4meMx%rFkiQ1w9nzgGPGcc18+CHhR_UnaAqUz@kLLYfd3%uJgc^Y>nZ}aL^+*@NT2q3!_uiCRFmss+*8OMM(7z?b!DY z+a5Vgr_bhbs}XF%^dy;AlV54s;o>U&6K(^qe3r?%G7W0Da{(T)i z=a&EpGROloI98hgcXkmRAk82H|%5Mi2W^at=0RuZyqR+IUhbd`0 z2=Ll`*sD!btr8u#KHu`hhr;>?3!Ij=bHlDJ?C*k?|3HX0@Lh8|{=2_|49Ep^M?B$OiBUo9_wTbCv1;-qK*KD8E%E4!nTgIWx~pBx7erIiTY{ zIEYdDQF6ZTa3&YN=xqJODStD+8q1n?SaAbwP}!WrM;q7r=gp5MyZOLlA52O5u}uD* zrnnJq9}&Nxc6lNs9*sz?t-C1Q!ugh7VL^9TvGCyx*3 z&hLCqzhXQaETXbhKoXPjvphhNBslE{_UsGtkC6*pV2}iv&R^e>fu3>c2gI;x3=#!; zv3mE3l2pL~y<)jH`%fvLIgWhrmNVqT)*eszyndvc3`lPpP1I7dZY9)G%7K>VnH75l 
zLf#|3N1X7x`2v7~cx`7`@MTYwELgBb1Rj7!BeyUhK9EZn2i~rQ0Mg+;5X855tioXJ zmFF2|toJSAza##a5kV?{miFI;7X3vSsiPoOnv5uY!-G!CO)u6Upc89g=4CsvCt(bf=7_xrFrR}rN0|*lX zPWFvUrn$j>ejIYQrM~$KjM@pat!fcbxJkx*=+ob0o+V=s`09Kz%T! z1^O$9OZze|`Sd^iR%=(Q746|dC4qU%iU=AEV>GHqMbOzL3f!gyc;W`jWd;$ZU)lgK zG9rM|5Om42Q7bxA=2wF!z4vJ(s*S7W8reGEqC{l?IeH4VHJ#@Ipfr{Y#l*zRRk{hnmKk}h<)mxxKavvjre!OZ%Gze zp@chNrKb1Rto>wcyskcR90I^Uso8m2lADPSi513u9OP?Kq`IveHq+=k11}dE48uoHB$l9 z?>yiMRFWw$)BSoDwl1L}bnLAX(^)aB;DBti_TOPS59WFcPJzS))4Bq>Vb88s=I^n~ zATwzU<)#j_#wYhCff?D0;Ab!ECz=(Q#}W4Xiw&HwEDSWjgVv9rs-M!|v^K|L%Rvx% zdR$Xrh5c8PfnSNrf|nma-!{o06v2F;- z29~}xzb{x*0awySo{?=hoA7+(ySB{J9kKFC6wRXOm#&Jp*hzW0@@@=(bnc%V+#w1!4O{ zaM1}@U~fTK3-r0Lhn7(YSJrpWPBfsA`R@PkK2r7*w<{mzawc>f;}f3q{VKrn@k)IYLXHGJ;I=!AA1L$!D(R8V_d_dCz%mXk{> zSi%1p3~6sz?GM)VmA_BRI644VW9!a#X@cVCWpQ}%`6H*S0+3clD%SKU_J%MPFL4eh zGv3;s5W0a2Xp<+}DM_zcq0t@!(({H6TiUJ>yvUo5HsZ<1SC?j8K6{?3aAajl>m919!)l3i2H~KTym3zaTE0T8K?i>S@$BLjdK{Zdw->9yD8(KU`@c~@lDAYAV`^-DvVv&6p2je7`YX^) zPevSE)h6OC;#aD1r7wWfx^sY^N$W?vU&D~>k9Gbp8<^9>(rkD#>rsmGoNWcrdWiRH z$=O@$liV-G?YHaS0@lO$tA=*40^59F6Nmw{`ip^WRdD9-Y9oF@JTXme^U? 
zr?edvnRp8zS}j329rqs7tE`hWAAR+);OnlepNR2-gh6vM1HX^>FC;L5ru<-x z&KCelz6jNTE+z}9HFA;*@4T)bi5JWCsbhAfG{EY+9REWGR+@OH18MI#F&nnI!X;Hn z0NNRXyRswFmE0RPIjrrcFc)8-h)@!!gWV?`2)_R^;oins&-eKe@s6LqfI#a607;}Z zTIp!S8P~=nOCq1{Uj=}<=l2@}RsxZc^`2fvnc2oC17O00rHKG;-_moFBc7=s?q5mB zUVsh~SKMCej;Jv#t&r%v4JMSCL!pao$YV-V(hBu%>9$^Y|H%qnTRhB1#H%&!_C}<< zt@}7GG=0~&+l}$BM?&&po&LEA*Ic#)8SmKAWbAOPDd@`RIUURk6^D`IxF;J+DI^`Z zi4Tm}k~AT2*x2I33mNDMsl;xXFkacT1`h967XnR$nKQBCN5t*%MK{Y%Cx3+P66^WnLGAvij2&qKA#Ip7oprI)%q7HOI z|A_cMBklblYZu*G->OG|53w=^Gr&IACtmBQ>>msFXwWmpeX z)Q9JunO|WqE0CFRbZmPa6cu`^h4x2j`}>oZ(h22q=iJNSDs{DWc5KMg89+m7E*p4R2LL5_ZNwLVb7eIY8EJ-tmbvvy<=6+axOZIkk)cY9%_}Y65|b%P zv}RwWLs-@)k@^M?`WFs&(!Szq~;hXIIls+PTU zBL(GMqT&itt4G#CWPEV9&I{#LWF6Rbq%Ur;Y*1h#gWxOVMKgB(oeb)yB_VL+bS->n zVOXuAeA-$z(3+C5w|0w2E@6Y^bW8NMBSJ|)+pLp-&fMY$q*_7U?w;STnHaK2AGenK zhiMFE%U2zEvQY{d7qpA`8gbw_pE<%iqC$_9vd-S-h24^Hhpr8Y zbB)84>}vkMzefDaYcPupK#C*peAt>2YX18tR49uSX3wq91*X48{7eNzvY*jd`P&UB zgLt8WB;Z`*fOO)fEc~3U{7b}tkwLuKAdt>x3=KS!bD?6I4C9GK7!9)YV85M7q*Lqb zS%r>06zRQ#UHSf5D}PwG+>DVf#@?w&Q!p@U@%v|dy0&eyr;P4l$Pb{bJp7!G?gJ_m zMc6_V!pqly#7_v(g87msZeEF7yGD=-U^{YWiTTtSBO3p@*0E3KS;WmuT*bS=zY=GX zUqXeBomEmkV5B5<&a|Qw0Hqlm{XY?X#4osq8LrI=j2HU-76>$e3m1;pW5idMP50=` zSw4v=jJFQ!>?twOevJXezYe(jgYgw7>kpXY^y|76q_PP~1Hbk_#cxYg&3AF?@Q&6k ztLtn{QhkYdjrc9%`#O)Cu=5Qb#n-7dI&3iE)}Qx3@Ad~CYGD#`-zlBtAU9xIBi)~!RePab6Bt5>=q>A|FCYS z6U`GFet(mk;o0N-CE{yD9dV3!hJ`6`rXNPtW^m*L&v3^^ow={p#4rB|L8#fFw^`6V zNyt665$2K?x1c&)!$E)LGZ)}rZEWS@$AiljmoTtrE~gr^@GZc24Je@pkRB{5wD^ro z`cs(ZM6!ploODBSRIabT<#A#KeTox+TYi0~pM!2R$xMq!*Vig)V-zEa^iCK*%> zcK<<+64VeRXjd8IBP)gPd7fvOYN8+;S3$Z@(zj$40)M9CG(thoaqpdYlxAQDyF*n? 
z3wtWSWR1CPvWiM=)xtHJa9%6=!HG`!rwM32E>Gyz;Qwe!+THs6ZOGcTHgaNO!Mr~~ zeEbZaDBWB?Pl-a*4v_}x89s@bbuvO(B23ljDnP4+S4i+3)j zSTDfH6KK`BQQMRzJrQJ4c>0EmL^30&M^kzkS63dG&$oJDPIriiBnpag6wluHsY(bTFz$NWFqR?rcVR z9)yYu=y;s_KH@&&gOpR1kheq#SJNWBW96=5 z(EeWy!M5}fNzOEwYf+q7TtaP=1YaVy5r2of==q{8?%*TA!Af=u2VTBcE)V6C8L`enw8u5Y&?*PMtt(j{j{#>?& z4+wr&n&1(z+rpE5tcPKFu|sR6rV2sHHwKAZ5;Im7Ddol%*EynVC`=!nDWWzsu$Dx< zD|tYe>nn(DX%VY7;M?=Q4=Y)fHqtxwA(f3Q>meOCSY?BLXg(EB*+(Lbre#n~O1^#1 zx6dN8Z`|phuf}kN ztSv&~hD8!MQJ@Zd)z$S`b!GO}}r8gfu3F3+^suJ^#KfhR~ zXX-p288THY(@aSorvK0g8Q9iDtZl5Of3_!~RF+NMDDaMP;OC{(Z)1;G8m-(LYZI7} z&73=a!BuI`N``u=5_r4p+cHduA9hxOYVBX=dw7ZX!?$rFeD;h~n)v%IQt~4>s9_pi z#sE&_VtWMLBR8<+lS9O}tc*-`D6RXum6Z={!`<@fN`Ky)wDUb_^T;Zc_OJ8tx~>sm zSm^Nm84b$!7D>wW{p)Y%#gp=&0DgD)z|x}9;`0r=wSoPdG4+eF*AvaC&Dwz$kPM~O zKK%^Tq#v1H7*~@^rabU59ooj$Xt8*OAr4WEb5*G)?iSg{$%R#*bz+3ZJBE=pEN#Db z;95e7#{vQGgFO>uT=tfxn>TS53OK)4Hc!;#nTlh|BK7t-`z=dV;Da~ILAOPA{V(L( zTg)LS>>XK!l83i$-2tidvg1-)BF4cQ-0rhJBtVQ?PJu*GB}KCn-8>Qqr7r=Umgs+C zz*YgZH`i&lYXBwl+*`xb#$ffl7T>9gWaxv|9%dr$<6>d#r-+?Bw>V$J8;qX0V(0Hn ziTNoaB3{=FhAO?I@$Ze1T0#X%l*Z(jCyOktRU%YX6+f`B|3xA_Z`2c{}r7yzh95?OKxETL?5kQvN2x5n^JD~ z)}h)Rh(UYqe=uptiKo0Q?tfZtz`ONMVMn1^HF65M91b!w1D2UQ6-uudr2T?&#d|ccAWZmOZg`T;8sk=7o z0OIJu8`k!AoPyoiWKBq(wZNPu91sOtv7>j{3|Ix`nq_-ONTu{cV{&d8L_x~CAxYzV9s}T|)CLLKcVbXlhwn|sKAHf2-t*S# zz#pFp5l8D~tcl07^LeKd(~^fAlMGBEse}!DZ zs@VDqd$_R~!D%gjhP>aAzk9JUc=v;eS+8p!EEeSo$szgtyb5Xro3b}Dfw~8)J#DQj zb2dPmp-|qj*m?cGafWJ=uz@W4IVs|azqxX&4d$xmp|Fu^c^$OWPAx)FwES@0 zy?F^`Rn6R?A`6!N3?_U{ws16l+T|FPNfFLi(Jrf^Nn(gfz7yy1Vz74M119UYKY)X! 
zp8f!mG)(q*UjrDaa6}TFm_)Ycy7a$!=u=qpUk?|d&?$A&Xmr-ifj3rGI>A=HMf@Y8 zkN7KjOocisUH=l?Hc`e48mlE{-r^Th3Mnyx!GKyt@5KXwy=A>KN2suGscdV6py%}e zJI7)1%kuRF9y74t(z2e>+~MLig3MlRno7lXHZ1LYMbBP`*UkketRx1hqj5A{SB60- zsUcPN(2fq-U&(@94S+3c09v;VZ)}VRD(#TH85vh3H{fqgyxb$@lMT)N3lKD4)+2YX zX3WX24); zsx32+!tpiYIjKOo4+$#(bk-YKLx)@$inCC0=<7AI;1F?d&x1VSQpSky>rrv^o$CMq zfB;EEK~xyx4@#1_ElJoLTb-(R*Nb1H)jOJ&L**6^Cd2yo<=jdZujRUQjOQlt|Bm>> z9gr%BC$fXe)XQO_lDve>Z$VU(7^xCDRYwwdXTs9k^-m0PuB{kSQwQ8w>EY81=8Eg< zd8k=y+`dBprxZj_&}iwHRmrq8S;qH-1yONiV4U|Ie^YVV0uOhkwy}s|^OtSd|H*R7 z(Hv;Samc&|4pwIp78O_3f?1d+PIs;{NvL3)T4e5-kNfMo1|Y4nSggPnB&}+Qq_NHb zqQ8UY9G%O_Qd{PVRa z0=JYb3XFL&5&FbpYWwEf!ff-Dr-MF^BySzPzqF=26P}{`NG0>6om5uK*gE? zGA>Npr9S+^veVmZOL3yME*=$R3My#IfEEF4gfVOY-UswMK_WIEra!VGU~Tu6Fu z^wvuhxR)_Opaa9uj%ZoLxanN(&~I=;QVM6XB`A3$L*P#}pP)ri_Nfe4NqPWRu2euvp4c+jBYqYl z0Ii>D07~2C+f-*eW(DOnh4UjIy0_UC(lcJy03=Mcur*0;fg29?;tT@5UH?R&eYf#1 z^VNO>x|P;tvXA)1B>sPXcWJk`#c5yBL@1ksCh|I@?1+7@K9sr~wd}Y%s zORJD9%4l=l@((_|$DG1+-<|z;v;W#Ru(SYMqN*CqfeYm=k`EZUw8LjLjM}_!AYw09b=2BaJhx@LFp71@t@iw-Dqz~PqanlFSBzNM z9c^z%3hydNsrHwrfF|2df)4mO*7G z7-5jB)%~vAP7Ro>&{8g3zyvZp0yj2R>_5WgwPv8_1i%eErD9`6!^WD<2Od~5Sz!G0 znbcQjIBhU^V-Fn~XASp*cshk}Vjs$7qDs@v=UsX$i0 z0T|51o}dHnNrxn$lG0>YfBp>waBrb+OMgPZ#AS+j1^0HQ&7rg<2y`v8M(Sx@1T8oK zJ9yK=hGuF=n$AFP>z89vE z)L21ho42L6cJURJakwhQUM#10jd+jv1#|4zF{FaPo%78n#H0fkGDr9(I|xT6u(3-J zFEc^Jlsr%3ApvkY{4==G$br#Z&xB*SB+mnxOsKm9x?QaMU;2W^`fNXasp>Ae{XQbd zDK8L#x8GMl&^V)m#f>-MW&u00%>tka=Qx)mX(iP;XVoN3x+?j?>RWM4v zl=e18A+z7Noj2ZRXq@boHrwr!V7_WH^G_XUf8hLA{#v_>5g~5pVUysD;+wadkdq3l}TVlnpJQ-U}{)tXIUAZG# z#ILAUBC+V8ReP*kTRK(mlc&Uj}zZAw8+2KiO2?a4sDnu_a)FBp<#+eQ3DYNvA7 zk<<1T5g6h^g0&zLMnek9h|XZ5dt^|V@FT9P#?f4;K$g~0tC~=on89SdWRuMSR7S4K zj55VoUqXfH&wU*QE_q`;kQI(~u?!P+j~zC!HARMuydo_~?RGAfzD3-F4&NEDTR~kn zRv{QZLx7#J-0wsz#f|R~HFKdO`Fvp913s{z@ZDKKb&F(Zae!)=uZ9pE)Z`~qZqzo8 zB(wf^JibPC*fSP;zK4>3<^OpLg*I zp8Zn_u>&Ud7?P6rhW)>+=iybR4H9VCv3_&;{w3l&LX^k#G|s^aO&=g|xr>d(t#ZBp 
zVgFX~F;)FH_G`sjMHZmrJL?}Rx#r&iW{qLeE^C5?eb;CKZa<?-Qbc+A@Ds7}wk3TsOL2DfIGdHp* zf=wL#H~O#(dz_EUp4xQ^Dbb^NrjBQE-AC*rE)ic5ROcJ1!hCH-+$_yqj(D$(io}c3 zC2U2F=}ADiw_)ZB{-}i^+5q;Q_dMHgs)#hLFJWU6vx1Adm<)2T!F*$aAhp9Aey3Uv z5qV*0uF3TWkbAItz??_iM|`ok+bowLUHKXoQyM-gdA03Fn8Nh)f*#WPR_1FPqw@yy zmbcFr)_*lZMjsJn6_z;lpHXm9H>m2oH@$|?Y`N}*6IJletpXz9N0X0v4D@Yc>;bDqBv@Xj_|>ArxN}oK-A86NU;V zj-K-YK&fU=M%x&BSK2W8AYBV>4v_&s@|w+uM!NevTyF(b@NVRr5xxqXvVTK8DYJ&aaw7ntEdCkT_E zZZQ$=t~QKIIbM|wAS00|VDk6G?HTevF;(k}&!ZA8-x$)LOggBPX};4cdka!-Xk&ZG zo4}+SAj(1pw{{>Uicz(penk8RJ$zno-i3zcW2HXNPwX^k4Q}~)08%eQyvgkiq zLe(Y?Xdu}XomQA#w-6MYh~Mq;gtr4+Lz^A(4|B3`H6`GG^8P{SpBG6atE5#|4*Zym zf)M(8W6!@3`tBqCCri_ss<_RH2ug5U&vmxeMdpl7zKGp$X7@(*4%}C3r*J`GO_ncR zYyxUyiT`Nozh36@a0_mJAg$e4^&nRS9?5NmdWGgI^3oT6;UQ@ypI!X-h@T@$s_1w2 z0KbkaxzE~g!9YjPlPWT@5#>l{<5<$wKT*CJiF$ME@L;}XfGJ380HZ5yum6PKnt}fxnBfEY$tD~9gsUpjtlEY?RjlP? zr)sf;mFe|IQrIUL&lZeS5ZQA zR#Y4g$c2Zc=(o&zAKk+IJ13qDd|-7%Y|9yy{xOlKBhNtSQ5C;~Aa@+hf39&@VZ8A> zMpjZ#?G1gze+U3xO`qR17glO2h(SnB>&TrkOtns7>mH5@PO?}}<+zTgY#hgya-ilfc~ zun`5wfjqDVx1OwlUM$Pv<_qrp3g>+>2~|af@qn{Q`Sa0w5t}6)wSm(ZK(ci{0EqOj zA)^axgm991Lta_>-H)u&hWtyhw_=Q&l-bv{4{bOf@3!4){2#EVv#~>qqkAP`zQf}* z%#LWuCJLUq)vi;(0;Jr+oRmZlWn}J*isa(&__Y_~ZA;4u1o1!hK{^JicqBRB$Fi&T zcJtj*=P+2MDKPpGH~x+;V)Xl$JnCXxRrTl#WICt#gdy)iC_J<2l zdbANlT9m*&kIC;3rh$I6fmI`@=wiB}8qozbgn{56)A-V+BT}%ubRAp}%s<4K` zEs>}pq-?pm`P=si|GAKXs&Tl0F)9q{ONLdh7oY-Td?z^6AWebL4UgM+{;ZA|gyLao zJ$1wrtGdk=@d}$Gkq{wHShR*I(-d5-hi8uH*=X>B6E-i{HlYB#1|kWAnonBJ(-}9M zK|qZiOKCoyaGG-*|Ch~o%%|Y>@}LOys)6xjT|j9Pq)PoAaaTQ{qHqL1G6-lO|Ca8u z#}M8k{t@vd;^+1AAPVDbCTkiE{N81mpXe?Y0eoY>d9_(MJ+P`WIx~GuD5p+b$UQ&k zO#r3&!&JuF!q5~Zq?v32iDX*BhiMU5WvC|#f1N!Z!YkmJ0Cr_YLu>m~8JKD*G56VT z>8Zf)Sq=%fM`l0FJpq{-kjjF})CS94<}iANfX}P26gt(oReruZ_<|V=l*UdTv{VeK^^sAO%YRe z0k$A8TVMQ9BdRsZ_aeIV=O@V zX%Sep>koYJg;A^(X#Q*&)Z!KtgC_%_fz6$)P?-tf`pOGaCsp-R&b={Ja0Ws5Oyqq4 zN6PXX#O@=0UOsRA#>|F^7=-C23qV``KGxp%3`4qM732WbPab?@UItwf?kaYSSO&r?vPIVo-Oe8JDwO^}KHZo;iC*hj?gNPG4y zoAgT=|D(U^XyVzJ|Ern9vv9t(e!VBL7%dTdi3s{>rE1 
zf=qC2Z(LGkVXA~4uJJATE;8A`X!UcUkP!#7Wx({ipd_` zSF_N)W$u$9=({9$NDL_8c?Le&SP%2xO$uKmA1+uzDV0aq9+jmhj5(zd2b25CfYlUM zaH?J-ru8H+T|x`qeZh6MeE174X`)qnQ3e<}fR*)-H)MT-@gIZ5)5VXMeEo@j)-5Us zb+e>hF#f6ydsdX}0=ip8ipB_*oBfVP74}BnT&(I@**K#`FPi+;9*lmm5LcKp63ECP z(3YJK8yzP>hq%PnlJfpu&4@4&$$t)ZhyOwuqiOYUDH{)&L|4F6JCd1>J4F3I`rNlkX9#j0>s6ArvwOL&qc z=xj$O@a#8y+&dK9X)PoZu_@)>z~{Y}@50l3QVLODF;LIgSv^%$N}U9tMI%_vVH_+G zKQUo!g1uA4Vqt*V-_{ho&Lk_yoU%pSf!yw%8R8u`t*<_tCx~PLgp~GxHDQ!9r}ALL zT1JUhS(o6IDJZ(~Ez^>~aW@pp`Gq-Eq{Id@?fvdBQLCb_?? z1VH6`wl?Urh9Umw!*ts_Z1H5Xq(u;>_SduL4iV-p ze8ZWSM!=`Wf2P=fZ{IjTtzS+1A|fnP##6n2fb-eo%SNmyk0F(TNJKKw(%jksn@rvu zp#V^r;z(loCroDpi*x2P?fMK(*v*zm^g9zn zgWaxBcW2L;>rCU)C6^%SYsE=y3IFp~Z+N2gd0oen!r{Ux4b0_8USDF~AL~I#g;6LI zWZ*~~R~MJ6S*Zb4tlyS!Bica~6BR&$s6Hi&7p6a{fP!z>MMEcHu|C5E=Q|<}5kDDE zA?o~0Lb&;C7K<3Z(E@dE!y&tM1>O(9jtQ{OY}gSc;yd5_o}GqTu!MJDw2H?M)mWrj zXg<~mUUFF-y}{TzRZ^~i<3o~xiocooU$uk}%Wi;rixK=2Wvt;%?for`h#?Tc4W zUbbidKO$aHkXL-gb*+MO?^BxbjBsHpfW0xOqssz_(XzXw;xh0>?`!@rEV8Y&)~-aJ z*3KuahNDWm+|B@o8!dg_trxGLI=(laPl$Q6*>Zt7-$3|8Z{|B2Td~Na{N#8)f{k1x-k#^1+kzZ-;$mp=8^J(DydvdYSK&(oUw<`Q0;umUv@1(G^ zJs!v-uJ+%j74V7j&%jd6d;J_SqM^+PpxRoXY8Jl&oNV}_v&H5`0BMjqoay{IF!f*p zBG32ou!et=VT=Ye`$Fg)ZP}`9H@zx8C4Q^3y-;yK0GBC|Sv>)l1|6Uj`wWz@qw!J~(a$z}JOz^_`zkL<}E({>taD^jgCtaFk zhKoJ`={q-M9kQi^Wh3e>Pbuj6f?us(JK95BYP{I~vsqvP1t-GvLNk6)A@T}W8beAu zHUaY_siKk-mCDi@UP8*$j8jto>zSSZKyAJZ026~?E+F07p4@dc6+~@e?e^;%BTOW_ z=uDR0BGGT~Cz%6WVsYZ13b>4tH5fkYg(+-JODn3ECae^Bkg9klg4R1k-Cq!4i-nnA zJ8KrZ{c^n=Sy}_ONBHQ;T62nZ01V??1{W31^<{`xRM!poXG(g> z@)?{MirTar6L|N62x4b?z1yoad0=nN8l&}v1;I4Kp?6SC>K`lsGeK;-SciFhcKwBQ zksYlqV`qUWkqOBm;I?HF zcd8^EjjeJwCNb(r9#UpOz{s(l@OXO)M-|^y*$k_U3P4%YA>t8_*TIKB+OQ7+()6Xk zf3e@(m=;T+BlW(8arGNZl}kI=Cwxnrz5Z!gI%;TXKA@;kLG2?^kxWocF9cn6188u= zz^M+CD=duX1Ud{$S$EVFOX}b|d)Ck5;$@fxMPRi?Rc4^`(Rf(bfxU)QSP!;3pBiIY zQwdbT?aAgEx)>XxM*$bL$vWW19{Y7?%Bv2-Yd#JokRqdkQGxixV?_U7FmGaHRK!i! 
zPc}1ej1asUSPT62{n~-+W1FSZJ6QDF;Xk@{1|jboglGvCKM4K%h#s2kxb|Q75x?Nb zDo|o=@4MQwHBuI7LPmUL6*fE@wa58XoaG@&;wJm69+c)a#GRe%7y4Zn8CYN>u7Mv} zbM3;8sNSRjKp?Q7SlLoU?5PSI4PqRVD*8F%N5l&)ED}xq5%Et@V{j+!+J;=MGn%49 z*FAt8<%2Q-U~37AeDKx2w=Z{d{ zn7$@RwgczBV@vPZx-IE!h5{K_xuo{K2OqcGRJT?!2!v=3!WF54G82Wz%mMIA7>y|1 zT3J*^Kd#sBLUCm>yKm_lkhU&tyxbxKEpQI%An;=Cj7q*i3gs!d4v2DCenklgohk80 z8_;swMdhx&*>b@u3N+NjgH3Y(BJ4KCe-xH7o%z?+L0fWu<|Ww@NqVaZ7mPQ2V@_e` zzkh}qK9RwG1H0Cb+Peqj5 zkf5Bck{L#!y{{=;x!?m1=SC=uP`T=?bE5PU=W zBqqOK2e6h{(;0T|t+q5_>C<}sHS;EHnEeM2g&%RiXC61Q2w)@43X^rIju=(V;un^m zqyR=FPe=N6wE~tpC{7%WSN8#O;8z6pa~;24u32!pQWcDL_-skY)h=HAONM;%EgY;1 zbKnnGit8$Al!kC_>%X-*O&9AsAF-ke>)hHy5^rBI(I{LQ5vh8#b=wLlLWj8EM_7YhSNn-NrTQ)vcqL9OY5TC}gIfIjJ>t7U`7h<`==J>svX z7EtUw*sO*js{&{V5n+AwcM<>7`h-V-ZidC_d4fV~1LT$MO91tkh%Z?tZlvx|N1PCX z9N2Ny9R_wNJpi4t*Ntg5DnquDmBJR$OT@q(bY+0!OV;_oRyS+w34U3tGEs(6|6PG$ z4(lRRWm{77pAhh1u4`J)gVOrXs_lF0_!iwQ5&roz4WVf8#-3#@W%3Yl&r?pi1OOCJ zMI|!U3JIFpYezQtY)?c=_di%(RjFhrJK$Su7rS`*3RjrG-o#S{YkXY;2%+-{M>XK3 z<`2Ko!6GKT@?Tc%do($V#K5CD!?({W*fO#mphbHwxaOsgg@{3&*w=_35n&I-t5~xc zSbb!lhuA=Pi6j|u7y%lukY9XXbMOWGe8v!JTw-q=xNLb*1}pIIZ#K%KW6c@@fFq>C zdK~wA#J{nbljXZL>Sxh`8T%In-BV&HAQgL(U}?dobstiV1g*wxRIGVx1MKve zBz_WwR2X4f5_bnunFW8jkb%xh_xY3_Q8a7_u7UMmmMsrA2exB;rU=vmOX>(wh4o7< ze9%B-QIru8r1-q&K@E8hdX_lN?JQ@jAWgu{I^SY(c z?6c+Ky+(h{QZQj$Z$-{5<-%x0%FcYo%>YgYHhohovVjvWNnFhB76h%g*3`4*tj`WW z?M#_oDp82_GMLKB_e@3|BmPLT&;nmk<3QEKFE3%%`4f`JME$$P}asOHwRm8Fdt8%RPjB91oV zA>xTla{~R9_Jq>Yx{pW|^jG5_B_OU1OiniPYVj2#>B9w-v$J|q&57J*6%0CRg?(`k z=nccBrGd%C?@I+)GH<3~wI()V$A8DQ4n?Qnoh0U!s&#E;z~J!+s{jyG?VgrR7L(0) z6d*rlJ2@w8sv+b}oE|~Rh&V_55%D8AKvFSOM46Fb8%uF-WZ|34urjA#u#8vB>nAhV zg#nzcV$iS$SE#KpA{{4wM{lAehmNtA>1{~Jaxv;@2 z0?rHWrhtWNO%ZS<9;66DiS5jReh0YB*rzrRaas>zkqxT_uC#wVBhLhW!qKS| zyW!*(xQ^#U1}X})$3J@LENr0VU^pQoVNnJ%PuX!Q#1?NcHRX#1L8rAu zE!_TtWcx4HM6qM+QE#e|Y7L!|^FKf@4aVhNjfdD$-plvl%&(3WogLB4u6?nmwWc!O z#;E^yY^JfPa#!q(u(L9N{LUpwp+~@O$)Cd} z;a|C{&hpyyRWfL)24IvRtM}!U!FqjwN&}uJYrnoQ1arq+=hyY~ES3M2C^n~Mn$R=8 
z;?Amp83!~6#}+GZfIne0@(hB02X6g&CqfkSF}~v=9A}eJDx27HHWvn5S>arYq}Jef zM=#HhEP|7kp~ivBs?}YkLSgL&JTb}anP~_+BXUGfx2!_7ma%TEv^YU0);tZSWji(| z!;AIxSFo6>H94;~|Niq*+IDk*-I56TAX$?bQhuXI5MTlIE2?$0U+&nv`Q>HV^mWBr zH&M&o}IABpE=_?%eT*8(XSnVxdJ^Cc|b>sny+IIgca%J z#EO@=^!YaxLvAl6kvpqlUATYmM!Lf^7B#5K&Rk(t>mC5pTJ*eMkCUEXbB_MV#DF>U z%xtsQzEv=}o!<*jjeY>=&5r2aMttneZKOY5;rve2Flxf}m0*4XWWHo-0OEfhFceX` zswqrFuqjDFl>8h~TzS{@l{aXI+vE(o3glp>uPjKp@a8`ZV~V8`RJ75N%2H4HrCbja z6NS-Q$;^wrHIQj*P&EBzai#C5J;LO%lNI|j2%>{_`NGkYqZ0u>BdBAYtdxEWe@BV0voM9Kor@|a&xDPL`(b#USbyuxXXfuw?~tQs%6dB-kZu$ffEG3OWutGjH` zEOl5;wIp&E6r>L6H`1Roz`FvKRY3W?h(Pxyo^i)flRji3#g60@mG(IL6E^e8LKG%) zE)6FvD7?vvYYk4R#GgtLc+M8#bp+0kEGqQ(6;)+ZN0 z!I}(kV_nhb_4+H^>fCKWLIXd$<;vgJ8I3>NI)1TuMFGNO%1 zYhEJ0uSbc5wO|c)tK&))I+)t`_NwjKvuBI)k|WLW@I;-YOBS!@ceV^W8S&ea)z472 z|E!U5zPA?~?G-M_Qy$myq_6p&U$F&Uc8mAlEMs8Cc{p((^5$aXt>(C7zCu>01b2{ld1IFu+;mSy)vy*36p_d5tvE@uSgq! zNNs(p1RF_vFIKe+E~8jtifX-Xvvc?`u2QF>u@ZW1_2rbp7;GF%g}Kzkoi~c&_t^8OGj6eOYR8<3mSg}vL3+*M-pMliAE0DcKtLUeM@I_=N$*V1|F1r*JX_WX9V|C z#NVj#*Dz{5T9Ss?oN-{yZqhloH6HCp1@gJzqE-#q{q zcArPcs3{yPA0_|R??WkGM3Z@2=_Gn zn)Z2)p|7M*X)j~X`5i%z^9{c<%yHu8kDvAN%pUF=&%Qc>KJST&SA5LY9@Kn$xt)zx z6F$>Y+zw#Bedz)Wq(CRef{TmT%n;hBz$5_YeIYrtDfhOcQ&`|4M4ml zh^xu(b^Sn)O~luTzuM_9Y$A+|ejuqiuMwO|yCZBti2(24s55?wxU=~Hw+}bQ?R3FD#Ouy^Z&&%n2^u(AaT7p4OktV!~iohV$;i_@A|{=*P( zO;g*jMqpjUkBEN}11ItYHFFN^<-jdx4sf{yn3Hwa>1u1J?u7%Xa60eSyKvy3SLR`p zJnYYx)J{h-2r+*hf0&oxDdKB(l*K@fDZNHZy-|9=E63ryw%^W%{T7&=dXYBtqCp(Ha=@5mJvNyl-(hZmof;nWMO zGQ`H}85?%~U)c{AT*7E+LdkZ1kLZ|5S|=EIG64O7&g*YB0;}T`$w6f_wCdNmw?{_l z6xM9*kqG?>1vD}IWY0vEAhoY(QcX|agaMbB@w{m>rTce#F7#-vQHuj0wLrsnO`N9^ zR&8d#e_Kn4bOuZF-ije7EcqTf=$O!uI_MjY{(&<*@{Jojsan2nHc+rOj}c|Wi;a{D zjJ712XW)nXfH~04j7$aF)LSFd#KeQe?@LGC6Nx(;Zy!eLjQqy!^ChmgrdSk9r#IMK zry>hKCtJlcPe$FF1fe(aQe;n0;OKguEYy&)>_ei2pZkV8GzK zF*5Dp)oTuY|B4up+SHslr2?bTEa~-^wDGF?)Y?NX<&wo;Eqp*lkIRNToC)*|QTff9 z8t*?FrEp?=tk%3XC1s!oy|BS42p@Tv!dC)y0d5&BgM3^+(Q2?RG{LZvZ`=@xI)Ys0 
z{7TER=WiOLKt_VWVJ!j(6Ixc7pn9-ty!sUk^pUWzV&1lBfYoFsYI1L)o_qlyu<$eY z>@!sY93R%rurR(e1=?3!e;5%PCQ{D}sluv14Nhs|tKW^3Uo>&u0zl3Fz8dvfL4M5P zOG2Dx9mM2x2QcbAKP@O*CfGqA?jl|z{xFhb%K@6qq>d3USprl+u;QSI&)xA1wkOuj zmDRW3zjB64pv;E>qKSfN5%E2_3q`LvdHf^dpAmNvKXXuqW%+5B+Q1JNxZ<&Avt4MZ8D+Gopom%7Ewsp6>~@+CV|DXFES|iQgiAPwZC# zykpG@VyKGn8$16CDZ^yxw-l>2jCm39Go4r0?87YdElP6&PuhZ)2a7>cW#2w|iTFL@ z-w}Vo$SO8vOX+annoo?9Sp4!}e-`m@C2qHLBRsF^2l*fUVZ+H+1-l^kZ?iR!($I5Q zqw-`oSm8PL$w1}FoQL>x)iACR&#==ohFo;P)&3n}qN(qIl`Y0t5Q;=ZENpOw20wFg zB@Y)lSp=<@b(AStKy?$GiPt4L$%#|>Gry4{poybj8{I5ov^GUWYe40;0bY7=7E9uz zin};}rsXMl(-X<(0Fz!3UXF?N>iSi~*pRaYLWCh5{QymmJ2EaLzRC@2BM6Y+G56#N1~b4kwdm3G-XI{=+^K!_e4*FL0* z_!{wlM*NJ;%)5QZFuIl!U-cOVx!S{8L^n(B`oZoTY-HBr`bUe*o|9ep&Hegg`fv|J zHWT<_TWeF8?r;(r?-^FOp2(E;oZmg^^SmOD5&xadAVx!WBVQ^5ng9L@N=?fMF5PbF z9J`UV3^TLhaC*r2}RU^ zHdQs)Dh&Ys8o<7friv`BzL#l*FBxZX579i?gftZf<~qakdYat~U%tZfRy41uaCD_6 z^_cAh))^XEkBL8w9E`$T-Nx^{w2*x^HL`G^x0%!g>P!)8WHbN5cO94QNCm3YF6#yY z*V`Bor2CS{vX9(rw4q_6{nrvVIsl1epto0n>pvLt5k#T`NxFD8YgC^sZeHz^M-sJ67wNZ% zuQu{P<@>Ek_$D?bucV!5zSW3ph^t5qXyO0_NPmFyn(4H&O{mI9SSI73w@^J3@t|h$rG{QmFfJ_V&0}6* zZ|^wn%FZ(>aE;yW&F_f+*kJb!DN5Q%xI=<-;oP1L(>zB^5kEsvq#tewF=8Pn;@*cP z{0|7-(^pC2Ce4%7=#uvqScH^v>FIw*sP{qj45&u`jZzyGMi`Q5qoz7;B zyL&=iHhsoHi!f*~A;tYSoY@W!kz$By`N$w?nN+@4em&Dno8J~cXN`BlYN3aGQY81_UNKUN6$@*h7sNzOpb@hp%>$(t$Dsu!iLlQ2&|H1n*aVgP}C;Q(i3pbd2k$zkSv;rEC)Tz~Z+R9L%+Z&Y~-^QR^3 z^~IX=>m_df^U$kVLKV~wPh4^r|Bm>-B7RMLNNL362uvO*=IpFEEQx+2_f8j7#qy3MOQkgkPEb?%N>XhrJlWH{brue$ z7+4(PK=fEpZYeEy2yVWDXeg7P-CEJ6wCc|lXTG&Ty%`GVfc zN#sOvwWV~Ti@HDo&ItbB5r1UuXGiJh08_Ie=<5ek=TRqXAdKI#W1VS;Mw?WeBIp{s zy};bGV5cxKlIz=VcRv~jyW@WwJ6K`eRgrDvQFXUxxjz^#aNsa}@cy(8c*U*X8~<=I z5IU=QpCkSq@oZ$%!h6s9sV8`v(j?QfF~1;)PdVs<8#^MzsBmc`_uqqtru9I=0u$Nc zJ!?zL3ihZpe7`7yU5E)^i1O!%9|Ro{`#p_Fr)&&s>=9BUfo5Vk_Z9|Ei4WEXM7y2L z-CP+_(OL9d{C;WA!&I@apqk$BPk91cCcHP{y3B6N(I|o6B7D;_#18MjhchdUf9zO= z`!#|t%qltnX~CE7S$XxUq%^t+*As%b$>~xcrA$F#(m4A7+4hE?o zGey`y>-a%}cxPihjzpC!HGouwCbd+Fuj!EUh`+knK!J?+iHHr)v@STPJquQXJqy~* 
zGQCvw67mKPNQu`r;;#{J6a;Tqq3qU+18*dxTZ~fh|5M{v5*s+01Lv}^aX%hBZhuzj@@$Oecbuy=S?(loGNoG5$o&ohgvO4jZkfg>!{}u6j#5Y=MJKj^R zCmFgAs3Cl$R#D?YMUT&GG7VM$NIgR>^8F5FdufBP^1r_}htrZt4yb|Qv(E&-BkjEs zAC*!QmBixaC+m8t(J@>>MGe)&X3c_l7n*9fLLVn&B^Av*86~$Qvif?ZNv}-v6q?^B z0aQwrvrz$^Io8sU(3mtsJM$F$!^Y-Py;=v8j&{0#l1fzo&b@=z%<|rJjh%=kP>oTMJ^VbHPR%d(BLRx;07$Uxy4_|uY;4Jh54mS+los{tB^>e9vuN+P zh<`vqs4y@Yft{gHPRt?Daq-_NiKiXzVR8Mz*WZ!pyaPH~t1}hr!z1q@+%!`aUx`@K zb${ z4lEDWp8ReP`&x)-u5`F=`L2d~LWOlcTc_LN2#f%Q(!{W(sfRa4;Vvl9bSDIj3!A5P zAMqUVgGQ7Gc%~w``y<1g_9oRCZMl_#XHs|x5mbAc;om!xa_1E2_lWPG)lnOJ1bJAh zHW?XSXF;e!UxlMRgFScF(%7*;Wpe*L%@TKZvP!!PRVhwpu){#lpJkx)`}Q2&FkdHX zv;ZxnMa&YvabZW)531_J6*Qc&|DFqLtdXzAB@N5mDOJHpv0ZcVJ)8PK3!WlrBo4GB z!Q62n1OM$vg?eK|%dGu(%t1J?y&_-kshwU-kD3vYOaR62{P~gSyWuxRlFP>LUp0&Z zwb#bVvUgmF#0i-IJcp<^)LJS$w|r57U7gksM4A-{2kKVexn_y-2Fpx7vPqQAR=<=f z^!7JlF=*7u|MLb$Tb~m3U2kHj3sZbH$r+wdIDC(|k9Z_HiHoT)jL8VZ1&3YOri;l} z)pWFG#y=u{{SQUR$A}6sr%vUJ!@ijJNfE$65&B|K=EDkCHQw(X1iqlaMBwTeseN0i zfE0?=XAWpX=scUp5#+AWfYsUPxh3LF$cZE>xW^5aR#smS2_xIv!S#2*j~X}Fl3CrY zYXD*VSP{vTg4+G-%)idU)=*QUb8iT9>WOx;t6Td)~NG-4FD$V6&DPfFzs1e zzvD(;6aGwr_TJ{8+;LPJ)_Z#e6Xwv6S;J?_Ar^OoXK?fa9zg+et^pu|$ zI(#|sSz@)LZ%(=~=e>{Iu6F79v zxBhE63T3-)<-hVkoE@}D;I0RP)+0{@tJfw}ARWHLwvSeQnG98p9OoecaD#D%Ieb=NaakXhc|SEtONP(dq5qXvtj^v2_GhtUX42r=B-j zaX;359AfWl8+oC(ZmpnOWFH8lGTIKvFIj$V-DzP4o@QWYyyXOpmU>iSe7mHu($~l@ z&v>G?xPK&ipRB$e;Tgac;MviIQ4&)F8+eE~(2KRWfSN`cHR;q8E8oe;1wS(gwg%wN zmoU5jJJ9$$9NZz|zasv(m(cZ6`cN{`X=L|iuP~pgOSThHH2QZc4&P`h={YnfYTD)c z`U`U%k|W61^j068vItCHRO0t^VoQAUy!&5)B?~vKGpyX}fUo@k5Io~2hQtR>c!rh- zMoY?GQrZ9b&xK3Xp*xbycQ)bxe|84VOsHTMua0MDbVv?*$oQV1H}SRwfY9H*;XNO1 z4!|pS-VkAjtOKnquPKqC9;`5;(u;`sMFu$-jx3n%_QP)`+rOfQ+kE4@o&a6F&31dGOXJfeXov)_Mtjb=-vEqQ*pI57m6gw>zfnl7d7i`} z>%d9|i3f(%7?%`KAcv$6L06$~+1OJ#Hhj@6GYf*$qJYqVDMujIeDed3FDh_nW3~xu zK3I<=VP|vNXPmL>^B?O*(B4LYG;Bmm7&%A$jzps7$5Q)o&5#4o!X|76Co-o3Qo>sj z?B^xk5cM`3&&zrNYvA{U_FsQ^vZqR(_;ZS&M-uCvX%HQFXJmPL0&1HDq4L#(kq12{ z`3!TgwZ_;;7I1E4l2X35dWdwY&#)^+vIG4RHPDfG5}>e8-Q 
z9)aZxkhQUsR=ejYEHA5#`3)dh%aHs7F(5-BP6opUzH|U?s>@*qNLkpwml$i#QcDe> zGX#Fk)^IArJz?dh6U0SfSPXIT;C$&d_z)>|GFw zC-TCxZBnei;sWDblJ=a8ciq7Ut5x`x!q-UL{=zM2s+5YGTMAM?K!CTUm#bhLT{hzW)h)dN9G@5juK9&{12~?fMsuU@c#^vo4Cte(`GE{FyqE7g1`HbgGmY zK-?y|`gw^Tm>?QJQ3VFKur5pEsz7rWae&V0Y<{XTw;=&h+HNIM7i+xN?-bT8dY1%% zj_GkV#(0DPmok0!LC3ZC3;U7I=JyQCM-W}Py7coCw84d(w=_V}lD3MDNjb!VO0mez zD}P^DBT#7*S;o(j0B?Udp?X#~!I6Sglc+FUS=u1`s;M2nNBqjZ?)cwoegE$fk63GK z8GUO)>2e)?^(}G>qv?0H;LwqN=m|O8&3;Nq0#SWEc2==}Gw}L|_&wtHY#nr5>JTPO zV5BLp<#e`LA1cCbG!QCH$A*)0N@jV5v9KdPcth=e-7gpL+Ln^}C#?~lw4P`j?g6RF)%cP_lj)N-x&-Gsmhg-0i@2$Rf6! zh+P*LSpsZ6vgcP_gu%~F{Oe!=KmX-l_V%=17hJ*-A z@$x60xX=@u3iz|xKp)o>!~kc|(@!tLMEATw5%!XF(gr85wL(e5_3v1e3pM%6XP#70 zKBx)_fbPkLx9dKn5e=r)en)7NlB}(pxf;2~6XQ9UtyV*Dx=a$++N~`TTrkg${;fVY$8X#;H55BjGHZLgX_sOVg@=4nf=!C$Rqs;9iqGxL4O$7FF{;e z_E|OIC!4plc$MTh6$_?%(7_%^?aGwJ&3DEPsj0N)&ZX{>N?~#SYZ%fkIf9rQNw2TS zKiV~bDQs0s3zQW^xYQsObXY0^s943G!gpc%sT2B`iC~s~e$TLpH%O3%V01;%BnoTjxHs+U!Uv43)dmBsXlFPW{86U1;NVLpDcxJs?F8o_&4P=Z(kxgs z;2lnLW2NkS{Px))LDUaOCEE=)((7EWgS@=8w!&Y zhE31Au%+EPix%lRTBamlbIc=8@wBgz$k2U6n)xTt!40FOMcS(=F>Q&jHk zQLge$H~W#W3MAmU^f;y~9KkwGoz(y?_I!}{^`n^F2Va0b2cDMS{3it9&5+uBmL`mS(WY2G9Gw2GIQ%x9B=jG4OXga+T|5X}o(^v-b zx>jM%YRe}aM!~U_IdA#%i4^(I*DU~)NLim)F2$J19FqXeLWx&$8!acHwW4-sGio+j z6-RBny;PcKYGMyeDSc&CGWlbLmSM6%E*CEQ2#)zq>|aDsaXvvdGMNpW_;z(jM&kX2 zy>zT9{$^q=RJhA);08Fe8TF`$bwPKiR^PW|7KmuUBO#^*xYssWQLV~v$uxlp7UBS2 z={O=XqsNG^jFo8DBJ3~(cd>@IorQ6k;bq+kThOUcGxc?ASyx3#NKE(wffm%Ax`^+~ zi9{1na0?4{Oa`&I{j}KcND40Crh<5pHNF5G+#{BsES^j@4yS-&Ulakuf=f@iskQ(T zq>>+>wZZE;D`T(cRh;P{5kXDX0Vzk=S?80spvIq-_NYIz4dHhE^1g58h-7x1Wlc>2osuOS%rjRI zE>bnC8=zbV5NQueL=by}euc%bSI}!(1Xr@t7=OCQ1R4mYmxTVf1pom@0o;q_a*Yi& zy(7C`?A`{2fSNbNn>L0f^Rb|$urp0}W5&Fg zr4vTsyjkr?MQpX+Dm6`6l+J1mfe$SRmuJR|96sv=o{StUsKK;$jCYorKdr|J3&H6C z%ueG3tJ|`8t#PA^SB<+C1@mt8-UqDU?Xz**@^vCjOX^r#D^CPCbdiBn_#IK)9seZV 
z+kui!hJq?^mpcP-lR@!^h~R0k$E*V_EP$0<%gNd*3eZP}MH;Y^1IiUOU^MJ)8@_?>y+Iu(ze@+h{jdfu2UY&WxnPvjl7zYapMR^dDWH$`!LoOOu@P}>UfK&-kwjFlXNz~f)=6+IEUurFOuSYeL(oxj{77^;Df9f9RWV-_YIRpQhv&Ap`^tTjNN zp5YFFq&|RWBs|Lh1qjpA_X;__VY1`I!-3{^YsA5ET|-a+-&nDBkElf|1IfuU6v+Hn zJ-<-Wy!XNrDy7X-mUde|sfTB{b)=Z~GleC|m`heVJ%SZ!>R=_~65u_5_%-5xMf@Xq z9F`xU^OC>lEqY0!p>*T+D)GT;CRel1$p2m_-QCmBaXUX1N2sNG`A*T}t33)_048HR zZxO#E9H=NXe&DmVfR@tSd-|%V9w^)f2kB^oCKTru=eOey3tA{OJHr>EO8Cn<`Ssdl zgf&e5iMQW_M$#fmSQau6P*pD$`7qzu@5v-BE#Uth5fQ&c{KE6Qr-=86XZoP;sb!}! zzQ6~lxKJ5|G2`Hw;QC@sMVZsHfiVyI%0^7;3LtwhKeKRsS6gE7@f%K}vW&}Tb;$W! z?jye8z#0+~7nxm>7$W#9t*iz|Yz(_k+=XJ$;{FSRT+LDf5`1Zytw04a=#c>EaqP zBAz(B^IMN57R!`Tw#c~OVA~WcMk{^vYx`FWny3=J1Brr@W{p!*y1n|2sg zYxYy3Pe}Vnt`IAddfkgWM&a;C z?M4O4Urhg?3QE1MVEO_KM48ewlod`#5}Zso_yl}^F!6)>e>R98W(0Y(r{(?m;n9#r z2ab}JR=A!n`$X=5S{sXb~Kr zYH-_#pCi7pc!hBR73J594F|pb>}Znv!W{i*D_>LoO!*U0g@T5{yM@`Z7~)O#7Q~{e zds}*oN7Eoq>))Tijx*KT13DzA5bypR5$j=i6C|gylh!<`5;Dnq|3}0_mV(?apLnVk z!r2{tFcGX@*8qCMngeTb#(pK9DG<^P9)l)hCxS{&$wf_^%QLY2xIV%qeIS9BZbv1g zWWhJMi41^H4MyeGla1{u_}GOEzD#29WUU1E_83#$kq~N{yWawUgUAvJp*HCuRYLM91yPyA(OpLkGTh-C?vn?SNZ&Xc$*gmF zu#pQj7Jo?Q5a{!2U+E(Lh`8GP?<)a(GL1(D!HUBlaUDH_5evNEgcYV=A%1=|DgB0| zzO)pzT>rQvNN1}O2x7E+IB>7|7^Z`8F*(d0WIZhL*y0GOKz+b4L_riRSVM>Xv^E}f z2L-o!!0FF8{EIbKf3rusr&~~B!gmz&3IJG-wC`j6M8N^7?#{*?R(L`sj&n%+o|KVX znj$-sq^q{+7poG<7nqMLxj znZ8*yU65?p2>mhl;q-IF)UxhxKOKVxz#uP>BcT6O`mG( z8S3za6@m7Q8vl`AfdvaF`e0)zPsNEi8T5LyN@C{vFE(j-N71LoibU|0YYe3tihyDd z`f`hwLNjgTtBkEYL*36&>Sz?;j_=JF|Al6m&Ymbg;=c}5@}~9X>+lRzG9`w-f+X~H zM`m27GWHGETw}r$t9|BaIzQV`vn2&I!MnX#$@9vVUdbg#mT|M@;7Sbr%6TXZ(`m*` zmxZ5a>wqtT?mY+hQB)o8qPr? 
zZR99$(X*rfEuxC}1=&wW3N{)HWLme`6S9*}ZNim$$^$%W57G;Jnm&JbkPp=^@qTUpzX7Xu3$Z9cy`9(=XPfH(Fyt9NFv zG?;K_i0SelDA2i=?15t9ga~0pLt~k$M#ytfIh&O5C3+(d0c;wUhKx-Oy^px)|Y>e!eof)(gzhjO;7kna_fH zY$!<#!BJbLAC zX7=_KGCAMO+wktWb@&fsff^h`Mg=z7&I_;iumvh8pK&smYygjVK~5 zjb0m)@0ocylLt0d1318EpFW$XS98!$d}&P{DJJ`6JxVml^nSA{PvFMx4c``44PM+u zX~%1e*cq2^1W%2D6iP;f1w47{{Ce5QUZA6!g_^q{dcZ!&C-(M!d6u zE-zV&BQU74J59H+#0p|5m74R#2V^a;*?Im2NtD=s@t>-#jsU>O_VV=<8Xh<*M|-rH z1{sZZOmFvpu~D#%3D(qju;)E>5`b&JhiWMv5GIgtx@IR9GBzQI!30BB;Ojk1R>{Ih zAhI|>2mU!MMz)-)Eyf_1oH|gI3~OwyWMk&#eI&b8Ise81maLnO%b6?OnEpd*t5aeF zwY7az5r2bpKAHfwp{(<;ej19qF{w3bI7o>>=kIoC%{{_H%}A>&lW*9Fem%OfK)(eO*-0A)UJulNBq&pI)b_ldrN0+ zErDNH;fxTXvI#3aj}jH?C{0u|SvU0}fRsdrJC@ymj&C!C4= zR<#bmvwr38jK8=p^SsG)StpCZDF;8;MBmbg&EF!1h$mV#@74jhU}K-GQGC|W{%qQ| zG2)w{vD=5kR8S-PcK~`Qs)R-FMHVd;loHgMFwkT14K%DaB0a$OLM#@+LO8WmczEVg#ql| zoA|D0T)c8&GhthD!eWh~)VF($_$}h!5p~4>WH@e* zh~g0OZT(1GNq~0N^SR-sdWefBE0x_&&TS8L#FSAFZ06PoNRiYF1NI_}6n7&JYAp>) zO-T_`2)sstJlZp1izN;H3IUeBoCY2F-uW{xpwTmTRIMFg&@XVZX%H7{tdx>2lLAVJ zpxS92@nAvgYWOa%<1S%bH|A(mZmC$23T}Q8KsAIwWTG?=odHtOpgZ@0vVPI1H8(%M z4zK_l4pWALlNoM^9uv%m&cH;Ke^G}DZdDq#)_kQZFp%f*00t~5DDg+?eV9128iX@I&AH*NdpqQ9+A*!Ra?$ZmQ^gf~Sm2KZ!w*51!&{}tx7}CW{OX!spvGU_mS2e9!fYS2h z789QXz|ksjJ?C+Oz$fzoTeJk?Wg1f3ja_N}`p-15Ht;uUN||@})l!j#3ao?N|4P&9 zbHpDO_qL#(&00#Il!<$c>>CkV#h}jEl?;NbZBl|K=>_2K!-RFEJ5cG!D})m@#grcr zTXKpPQXr$`98r)rc&x!Xb|>O?l~|sxurd=UdGO3kUOpL%sb=cRum4K<`->G&I#yfk z&JLz&kpb%PmR9(@TIq0t-hRwtUq3<(49$^Vv@J(S%{7_BuOjX{p`&CuTig7%Wuwf( zf1R+?4FGOWV9=DA(m$59pTJ4HfQPH);SDRerRPs=^#6lrhE>Z-#7L24QA3&xR$k07 z)jqOc_6{rn#Ek-~$Raj&ivxkXx4S>DSr|#nC#-)&{UzF`v?}p%iAd+t_J@E3cicuz>o^s+R>@Svdf3a=(8-DE*rfqh?b zF>lmelufii!B1Zk_IY6xNo^0v3X^E30k~)Vj@bRyD2Ra=dZnzH@j-=o{~q<oJZzy|Z4Qk=?Bmq6s6-^(msZxqgku^KAORR(D?O6uC4 zsjTfBZOt}h0L7ZZDoh988Vl0H!M|hULzcqTw`{F~DW@q7^_F`~0mNtkK(f2AHrx^w zUSpncVY~{FazSANWYwfUvZGK6Q*oVefol0F$y!uZU0^`D>l3oEosC@iu!ljk7~+D# 
zD4=)lFs~X%P>}KnQVa=zyWwEbkQ??YsU`OYB~uo|D{wLZhpg3du=>8=3@?%oZ{W}y<83FS#78QCr*#cO62p-lI+&_b20mw?zgkWbz6#Q+7Su3) z!-2(8`;cZ31yQcYOP|OI8yj#l`=H+o+F=W+5fL3 z5g**zXf=ez#n(oRO&05W&VuU5XBzewF?57y6uEwdwrg-kvoCnds<3nwjk){XvJr7E zY|V9!uii>jK)&pai=Np3cQCBJ@p-L6tk9u&%5X&#qyo&5n@`b`JXA{tyEX1Yg8l<% zxS%C^msg#OnW>|01Rif>02^N!3)%b{o2;5@wL4tpG20H zcr=E4u9i?#JQ1w%Pwpaqig+-Fvq19jgAVaW%0ri|1218_j>ZL6qV6 z`>#nnj@XD$$(6m5oX}m^vdpdP*F6djMT!J?+o*iGO4pk)(O(seP-9ro!~(X@(w^mZ*PxQyRthX zC??C#ZYjK6pUm!q8pz%Ra$;vQO#~8dG}Hk888JotVh;qWI3_6uQN%e}^{l53tir-* zx$5HjB?Eso!hZ9ZEnFN8lI?rFzv)d?sJEj^So3W zFBUK8#Qp$DRV0K*)0V%QpkeV+2f&5HBh`mu$k~(ux34I075i0jufvNLOD2CzF2O1F zj;2Ue6$N7l1Mq*oZjE3nLJY1jIKXwpL&T?uPvGUr_@@>Hw(!k0 z@qvRaazn}^E+{C2M8sXh-(kuYFEQhi|3BbZDtOZjk>`Q}k%XnhRB|*ct=*0g>=R}6 zh5dKd^qIkxkM=N#y*1LM2sU`bs|}8j7*j18h1G}O6B;@r(D%GEGv-C4^Spjf33pK9 z?gtEJ2AWYK{xL3vwHux6nMKbZ5hokO{lvX=7SA>)Y_%L|0K&wF zX4W0oew4nHmS9v|s!BJPW$ih={|emqYMLR9VVLX`_XaE%HGm@}r~8u&&!Qpd1-iMXIk< zU|=!luX#(RdA2k+HU_g*z|aXhVZfIJfrlk*m1X1u6|jtI?7t)a@aH+Y4awc?xRA`A zKko{?W3mOAkqB-EaIS#$E z9OcTt)p_KnU`%9yu0pvQXZA!rMG^n}(bIUsBse{h=vVGYsZaqJ`<|Gfpy6OOJXM{A zgmTQZP0o)Tb%o!Ve^*$2HApL_P5aCEEHP~JA2kubLkkRc38{)n;_Mdw_{kD)<@W_J zY~h0y;*<)HRtZx=q2BrpC9dd6HGZh#s9vz1Z0ytKIHS3DsGym@S}HcBu+Hqf|9q8D z&?iHjLZ1cRaav#bO$AgfnYGqhFKe3)w9!QrSm{8O;5lN5_(CG2PPz*dZHDy%=0?s7 zH4UKQ3nLEOh<_n!xxFT{VRfrKSWK+Bse*D{YJ-xl-I?x7KdcNNUagpM`~H!I=(vRm zO!bCun#+oRZaxty!v|~3#$d>D;$UlyQ}`C7X2DRWnz6ejAo{}N0TwD6r4KD0V?+`- zaRv%L?rKR`BugMBRmF_lFkx5`a-SUFf#7@wb!1!w4B@hH|5oWv_7c|dMy#vGeO z#%|Y;GJw+wYun?rX83l9_#E-4jVSneB^{cni}ACLo|Luhtg^c}hC6$NS8t2lTV_zu zTfN=KaA!}>+c8i;m86m+vS1)*?)(2nnQ;Kd38=~YPuA_7Rw8n1UvO1;V-!S>jczDe zN5g&kPa8gRiuc^wLdM)u?EOTUSOxjcR9Hs}^0nKa{MURxY6vLkJZl&+EYGoF{Il}y z-8lTpP_7v5#*ozQ%^La87zTFPUS&jLx9NW*8|wM00wa}Qui3xb=G`nQ=Vaz2S>;l( zrgJ(aXJ|6ssUVRVZKP3!_2v;j7-f(;a3#q>1eTs{PlxMZGrQ6vY5~%EB~TrWk8bc` zIi7TpLX}2aDch84IwR}Z+tR1uTZD%&o@lEVCK^3O{3GIj*pRi`{d?k3@a19fUB#G= z8%$MHEtXL6aMi{Zs3LB(Ya1T8&L_jGM+2*>AYDFsrmjA3ZZv(hxkh1imxh(flKYpE 
zbHJ0imMbv*cbfH56-Uml$XCeGW>(zpOk|m!VK3hXZ>S(ITDEyuum4;`|7t@t!eqqr zYqq655{|Tpy`y|tf!BLFOZGVCoEzC#zreqA$X-gzi5mKxPbM{7I6!F)F94;TQMf97 zZSzk(v1VtI@XQWXyJU1X3IIybg%u<{f8}qU5Cn|L{_6-zi1Z9wliA514bU$F$XqO* zecp>{GA@?2)@)n5S%=S-RyZKSP(fWS%`XvutzBRd@kp1iBHkPr3j0rKMJgdu#C>Bg zOWeuMTtJ_wuoj892Y5-!COl5YA$dpdGx@DccX){Sjqt0FmM6_|0C{rv>Y}(1{*_3I zb9#-)0kNqKNlTP3Gykx5mWC?Lf(6jlsc)_@jN04qXq|yFIm5X@HqnjJe-v=Ib1?gF zP)h262I~Ztbn+qMHR9EVy6pMrELlPo8Zt`(<*phw;yZBTwtqX?#?*$Ho4FMnl9nG% zn-?$jW@#&=05CwrY!vaO-n$1QA7C!dHm0USC%M4^SKPCx!Vx!8n=Pe&!!JSAeJ_JZ z%QHkglNKM=y-*_!hxa^BK`jr`q}KrIjZp*{?qcG2-9>z6t0$c7)jDp{mw--B1$0CW zULmN6P5vb?wNECxoNQ4tLui9p$IqL;0)O>uC-VoQXG5sjcAyjJvE?S`gjNeVnpR@L z0!wd@bMh;`O17*n4MnKir=B9dMtqNmi06ny#4qM(RWz*eAMF}|mb6d~T-*E?9RyWH zDp;&pFqIohV;as-5jD*{HSQz@Fu6_SF8NM%LkzYrS)u?~1b|b-zax4^R{D1JxSshK z{7BXyUmokd;Rxv=PvD#T^(>g$CTxhk&5bK{5%Xj6JOOQEYXZb0&QO^_7V&5@oed7J zXCX4{hgAQ_2sNN|`1TTNlt%ps8mf)g$xP*&3UMl?2sIS+&eqnjU-GFN22d|`*fRQy z6;O5-SQivhO-ywaM1>_?Ejih3=I;~3sNVVh(s=vAhD;DJhCjRCz|0G&yK0~Fw3_dnQ<0Q3;xFGTD+E<|we zVl?!|`O8tNzhOl?w)1ElO-lxvQqana1F=d^T6=X`K3TkaYh-*e_4Q|)C*TE_$-h}N%LB)8!Zby{6!@4MdcB=Q z(ba0+xEcshL2Fv}Ro&ivvAEnZ`|b(gnwubgu;6sNft_W}iCitGO4dVDr8;ac9GBL! zG_f|ll@i+ofr@dp~1GAvm`4#{j$R(P^APZ2_t9 zgK7Ewx)(+Kg&?a)l@Vt&pv=DC)UgL9cEXG#CwhtK!8Zz$>CI;_$$K`*QKM3UVx9hl zrTT9Ofy67mQbYNTh4`Hg;}n5jO!*K-Sm>-|7}$o0V8F}Xk%7IDJWnjDbfWz_e;3s< zk$;PQ&jEBJ3sJ8FK(JvK?EH);nj)Sfo=Gl0Ok%=B_$MkJlkK_q{Du7sfXNwjq@p{P zU@+rNb?nMZ_w4or+cKD9R0f-Gde1PzS+vuY1ykawgPdmI8J8{Rky`*eQ)CeiPSZzK zCg%6N@(~Am`QEt4r{r;VvO*>CllhV>{F_99ojspt@%=f5`4;ia_>gS*pqFlQoc;to z+OuuF4gIUElm0I8UkfN!NBoXN?!vh}E}!RDM0`$E9KmsWV`x-HQC@WO4i&@MZXgBG z3VEW)_a+}d7OzV;9UM} zI2fr9Qkh&@%i6%+fQK&ZDKCpRXI7o>$z4QYh<9%=)0D&QId}E4Yvl-74dCyFgi2Qt zyzLtCE#lu1pCW#lr zv-w#(^kCF`Wqz+;auh!;|HzV7Ja_R8D-?&`g8m}e#lmP?6~2X1N_*Vzb+d@ABb1x$ zsA1MXT-ssWdn;hd(mJtMOO_wIdRP? 
z{K>#rd(M~WfTOjxNoH0j@X@cL0*1WN_3%U_9&JGLA^8YVc{wN7h;aAGN13$0Ww$5^=1qCgs*@fL6Bfb-^!+J%;zbz2>ftHagC$Gk4lzv=I z5w=|f-lO=;IQ9p?@a#5vp841(0CX~vU4&N&cQ6qQ$ECXH=bL3xXF7cjT=337;{^rI z?VE2h(6G!aCzlnmR1hUo@<6vPP$n(UJ0{_h+o8W%~tVj<5)8L?{)z8+;BnMtvK`*{FT7n1_J6p zclRb~3`=`^4(atH*VU56&%t?TX(4Mg0Y@A$U)OyLZqCJ}%{X;ReG+{<lhitbS7)l`lTv1 z6>vo1<`GZC%6S#V9CgM)-(C_hON6hfZUblb1QOdWVyj-A|$7F$jd%3!%I)KwXiDr7TE~4^tgF$KubtVK5GE zOzyuh$o>F_u7;tODiB-)75l#uGqy4`Q)3-b~RyMC!$1cC#naB zru(n+C&QYsxv%75d)8%I2aulq&z8M22k4k1Wc&tzmHmi*@Wz|m=BZ6<4XH1o4oSu; zyqn}i!zIeZ|H=3!JKowYN5zr|RBQhgbn3XCeg2CJ%u#1~;Lq31lRD&Wao}Rr2mIa@ z1eO-k!%D6tMed$$=xy40ndtC+1LUY%E_Y(gM-a!-E)IQ4XDsW9l1;`_Db*wyOHA?e z<{Q}Cr-+QFQy(MVBYsc`h=@lXx{+Ga+z7I#>RoeIPllSiWuyJ#CH+W_mvX-+Ogu-0 z|Mo_nV23hX^c!)vhI6US=msjLDF>Px+aV|xmPKl(HV|}S%s~s=_9Nn};iJK3Tcw+q zS;{hC*f1OOd3C5~5Oc!t3qS*(o$R%xxv z6_-6Sx9=W&d|C@JbpI93>RO3w1Omj23Y zPew$ZkY+8CaD}A@(?Y%D$;5wUs6{s@l)&7JT$}1|$|U?D@PMtF4>L zt?TUOIZoC(KG6l%W3!*EjeFYM0?B@I(^-iyDhW(EWaUByDNHw5lvZI&NN%+D{!|6^ z&K!_}z1TydIF)d;YEMKW1HCYd!n@+sNaJ(DQZ37zq#mdsU%7|YFzw#@aa(u5IY>pc z$3#Ixe^6O!w@rX%%s{UrE)jpi#7hEmW&ztI1>ujJ=QW9Qhl)@?`j09)-?om2V|Bqy^^fXEYhD$2QEMA z1b&S8uZSIO;CJAGAoP@+qLf8agF%bOlyp^TnR4QB9m5N(^4^?YZB?Nj$MnLEZcvD4 z5Oj;Rm59D;_M+qbb_tH%o6{ZeDqd$whTGX?#@#4pVYIpqsqn z2C|4xWKpUnEh6Y>@|d~RC_^+TG0_;pgp<@kjOTUGzVMhRYf*uX?S0NkWx z9Z|Q`hE;EGu4%jZL;z6?Xe>C-KfIC!-|k;VH{ifJjQ$SW7);H+@hX8Y1HoWG2%!qv znI~(c1ae`}U?#9H{Cxg4X-O2%ZvkW?g+AjqM-KW0k3CsUsoBh6GOR+_yR@Na9Yy>{ zh`7b8H~-SI=R1=EsQlYg=(;(u%pT$+yd$qX0xueVd$$h2;*4a@d+T3PHFw05rz#|s zF4QtpaGyDzY~oYC(@{1h->=Z8cCLGTxU_xS8dh1cDzzo3Ajsy<%R1c#2S2j&QWRzE zSz*}ZxMo3=!3?DGd%W*EaxO)`jU*<+&P8wCZ*`+In*}_#b2WR^HdUavL*-J4+pAhW zP51{5xMGV7TeEEdswAq!5o(9!u+$bYtpnhd$kNjb8#vjPCi5L@ly<`VONu~o0u%p9 zkNp7!bd2~!$--US9FPy(%x;rI&)JrYNZnBxtCl*sa&3S;Da=F!-7otEWVah_q}mfD zX-&j0fG}a7%syhb2B8z^TvflG_p7Px2`9UAAExwdXJecun|@xfKs(Ah8+lA_0YtW1 z5Luxr&U9JZ}b&n$xSQ-yN)I#ox%cZSYqubsMpqB|%S3HMNym zZW*9lkK+5Z9B8Pn$O~^yDTFy?{1+qDcb5EyNm%9sun=MUC0q?=%?R$_5zi417Pr+g 
zvH-CI`W3NQ?<~j8$Y6=m78HuY)dW!TrqKZwX)d6>Zyx%|&V2`HJ6coI9n1D&;{|W8 zfE^}X^?a7< zR=welw3Ap$19|zz((fY%f3o2$8>^s$AO(t>#vQ=2T|d)0z*;YKVeY`-XRb&+ekB&Y zz)*92^ozCe?`@s6I_eSJ8U>MCdy3p`VRiMvsM>8Mpz{1X?r&OB%1RtIDzImUR6JmY z3gtPFbu=`XzFY`1D<&%wzSlOZAl4K}yL{%3r~17;rC90MXealkLfH`?l!8kAb>%|Z z_~4bXwi$1$!Ew2voRS8mEO}>!d_5SJ;Y@5zd*Lhu=#o{G3{yIhigq>$nGT%6?e(85 zdIy+BMa-|bu8Kr+LBTnDUgER_00!&bklcC^19~368xaSR*v<}p%@#D~%+l-6SSnpY zj?*j|1=j-l4|Yncmo$W zyN*{(;CqsRE9>4F!&5+b2w3HmM^X_)uv>d3f?qiE+JY;-b+?&8&SH-zcIb_KQ=CO- zpdnAzPmO@XlA*fdj4OWL*`h2CpO*;c|NX&t)|g<)8g0mkvcV!1 zh%E+)oOg<8@@P+lhZKP4#_ffPmNWcxZ%t6IrnFnS@y-;?^Dm^^dIQjlm0g@6>6ly6 z!-}z>E)b}o0pHdRphmDVlKf3e)Sv?YwupG*ekx;lG{@n=nxdw43a=Kb>r&{Y+S7zC zWU%aP;Q(aT7bfadeAWngGrQ4Gn|FgOq!7x%Bke*S4=bkIAWD3R+a8A&#Artbjm z%CM#6n`y`0NBO(#N5^=+oG&C78C=w43C(T)Ysx+E5&w?pA|7pC!epG=Y>R0C_pCEx z3m?gb_P_usbvAZEAXeJ(!Bi-Brb_G$&HlXe9?Ms;Sjv?Ln=Db%gPl^SaNjy|?&1@QiLgFHTT`Wy(xz4N2^&YK*=E{BbH{9XX@}$EgDn@34Y~lYUJyh>p&^Bg=LnR*Uh^!O(jT zz_&ht%!B|t%6BatzN7U3`Uy*|FK5vY7}mrP#$fkS5%FZstulUYG;w}u*}yge)k!L> zf_gPMjYf-H5-EFTPV-ELoLkGlJ@-7n=Lv2$^Xu<%QM00;!o)f}VMDF*ZtcM3--SbU zFFo2;kpkMU#rh2(z&q+_Z)gs*Z98M67Ahza`0o*au++-dGnmwOOQO=1ZxF1EP?QH7 zG4W*?CsQGVvazO#^i1y=7&rgUQwWjtTNNRlwAWPh?$=)(-=!c%AFMV@_dFxamG zhMYDlQ4^;6z)5^G)TgLeC9{mRe;xH5ZB=7=$OSe3X{{j*PRqbq%z-%X5zidW13^mc z;$r{8GKd_!E-vqdoHRJd3lsDrrc79yq#OCu4p^+OzlC-Yc~;oz(zgxf$$wtFx)sAF za1^k>14`(Qm0k$(MxMWR0^*4_>F|g-(Dkq>L5hfL#P^67MpX^As3lh1nlx)8gUF0R znj-!c@n^*E_@(=ZV2#?%08$c$C#qbRh;KOWM?BAy*T71BMu3+}a};59)Rq!)ngR(m zsor?zN-j(JzhqQJ0s>}7aYSzA!CI9$F)|ySz_JqAuqvnzTw{TsO#@Jvi@swTf;thU z8@q4@Y8wvm^VM0QV6tBiHh)&ms9oET>aZu9d#CRi8FM3`6>INZE+0xJ`Mh|~0*>l# zbC{Zn3o}3Z16jLrw_&-3=__CM>r1DglAqG~aeV+J?whabY{_0_*jXjU1qYuqvSXI- zUl|zgjLMM}iFNeTr<9N{?mLi#@Sw z#L?wO2xKhKw*4smxhHf=?KJSeC&ELP+<$4EI;9cD1t#1Z9X#28Kd--r;T^48tj6cP zB5-TDi)|U|06lxOCXyECQ&@7b@tQyHz^;6c_{#oWXlKlgzZ?CVcZe5NAFQoVATDwP zeJy056IGAQnkpoN>|t&O%g%njvm@IqSpCw*Xe$6rMogs}Sg{i-ze+`-0UcK;jI;zU z3L?mDmGFcqi?hwgA^8UEIh=2pzOv&1>||hA004jhNklNod2+-q6fiCyrd4CPmqmi0Z}H{;V?3C 
zvNsdu1zkQH8H9Mk4B>yrJ4LGI=D!vI;t=tR#kY=eG zhC!PBngkFxTiEoyN0Yvn)+u^P0F=1#F5)}yuPxuqmd%q3x3SoX9FHxc^Fq|!(?|b} zkzluG#0k>t6$diXLiE|he64lsrPiCX-O_<0wkgSG1R z>m+)zyr46shDhbpvWa1vL)Va&41yHFDS$`07&qY;j~}i0!osKK566%Zu&w2VZ_lZ>Wmmhv5rFmm2(|Z z&CWGGsArQTiQ5LDfuTxE^GyCT;5yoml%z^xDhPb-YR}JAJ6_bWw5@wELbp_phRF#O zM$!Y&+QW`3f{FH`q@>Yp$~d)iSJPanCMtR0OeTIyD2xn>@H1;WZ}l9*<#*>czaN`} zkit03j*A}w{*sYO=`VJ^7T02f$GJDFP|6eFPkizGvFrQ^!{3* zhOerP$Qaqq2g~JeFC~FBp=U>$)|xu5w6u0i7EjFznbG*UMQ@1_w=>ur{Zml2o^ZO2 zX+$o>_C*F(M%+hyp;R`Jh7(H)Dizmo(Uf8P)o8^<8JTpLYbu{A11*Anf4Bh-ZML=Gp)h~ zi<*JndTZgFt-U}P+i-)s#D7g3_|i(w?u z#r4nBYlj;i$X0i#3Z-gIIbdDCbc-Fw&F@)-opA|hf4+HG_Rq=mGd`um$-MLFi=%f! zXLTR(izyKHxUK?$%6XH586s>D95i|D?JR}4;yoFLrq&_=$m#mqvzYGIooFLD5l zKytqbir$()Hu%fJhTv-1)IX|_qViUjy!pWFmU}$uoQIxmEo&b5u&pK$Ni!E(Cfkkx zy1i}A!qTfos~f!Si0rX3Y}&8CSwO2nBv{bFafdqZPgXa*_<=2{_y&jG1&_k2C?>8{ zKr-WbuV?!{TJ39xdDVu4Z+6%4yv`xwhoQa`hV}@zEM;rDffXE90a88DbSV&9ZVuzu z#h)U+S>diV6~ja*EIGibj7%*5nb09E#AL8Yu41u=IUoFPXQ1Fg=4X4p%ZC+6Q5Ely z?!kEs7~X~)KdmEW7KYCB9-L6H-E`@ zGaMLvO=jVJu>>>AFA;ubUb!&dvoNf9`~D2eY3wnIg3nhB>^b5u>~gVwC8)ooP%}H7 zH#?-IaTFT}{xRYg1c;M~rgPxiU^ly1e_hPTMjW9t#B07`e0Q4rDt0lY zpycJ$b9zC%*ppTsaR;9wJ|!%Q#8(1PL0S{!eh2zE;v?@bL&`02-$y(~yhr>Kgit5O zpYf3cn#rU6Ur{kC5>=Z@aG6wRCDj?L-jVw%kGlEKMF41NBs#G^y&d4f4%4EJru;=M zC^K7fHXUls)@=k3?V+$oNzVO=u&mf52~lc;s#^$wH?X<|72wIZL-B($NRDA}=){ESsa}U<%J_?LA%CP{5&IBmNCBsqqa@ z5&u{RfGcI4+E9NN@r|T*&WY~nicJGRMqZ-aPXWY}`ezn$qZ7J^LivSDz49EDJmM1( zBEuF2-#;bads;`8HOzK;AjP|IS~k{Soa4af=GMY9*^KO6a`Oec&;8N?^!6waEktfW zE#1XANBv@@%#8BiSAzMcwF+oL_j}^@8hT&en5w8^+>f;mK|!~-Z1zgZm@&XCr$Up; zcyl1io6X=a`Tmy7y27h}*awhX%Z#hAatrAKV3Xhc#RSwEERgYkrP7zTo&5ci`Li?m zdPe}5En;sp_cjI8p#mC!>9aroHKONH;Qz3xMGMc>l3Em2ioRmJ2kP)c#Gh*nIe*y2 z5MK=>K69K@p;;6F7b>8^o>iZ-Z;B%-1Dw_ypw_Ns6Go+9RCiPRMVN8E< zhIf9%0gqhT*N8vCA#ZCD!3zN>Hwvyb>=W47bMj@H7S#YxhP8IA=-C>8`;S^lR7bwpAZt-; z1@Nw3qpt~HE!o?^1ka3;L}|ke&P1if&a2Gbah$H;sU6op@zLda|NL4cIoaWJo*-WY ztQR$~46OPHe>j=h9<3755@!||YBc}RnfxO#xGTT9*l~*R0FWa*RzX4Q{>C0YMI89Q 
zi#n82T~+u`5r0KY5uc6s&doh99Eg<4BlcMV;zgTC&eCs2;0ZH!k-=!aOeLQ+7~6Q; zfcd2Z(6IppM`w<&_m-e8HgFEScU-x0$(-|~9oy!h)dEt(2M$*GzuG*rD{HwpfW1+# z1qkvVT+R<>L`TFQ5np}(6aez&6q8wTs1sXax*sB-!U|XN$^)>tGBe&NAoM&fyTEVD zE?bl2NuQ7|{*L&Rkz))%iYwIOCz}|%0lz!0R$7kQ`jKDB@3PHCv@1&g+`4BQE=oFv zfRyNx^Ya$*BjOnGBjOvH5h==~?Nl8y4Zt~b3GxsNuZw?&KkklE7`;y z0;P;bgUosaf4+ZMdb|POQd=>xLD2r)aAvxTlf`YR=)!8R9nfJzNT@POhSR)|tQ<@v zJ|+97^?#;sd~XG`=Q!OWf~pnBxI&i9;x25H-(lYFM?9wY9E_ywd`P~V<7 zh~pZ7%$e#FWnPgTrIq9=?rZVpg&7o9CzczT|Jz$emTJf*8xS#(WWOP$PtGDVlf+w_Y=pXApEOMC(}FC#wQiPTu*q7_{@1KvCxeGGy-{? zt=gH>>Nrp3<^w%4;?@N;((s+N)e)u3YS%YE5@=`A`an*UbLe{3_%FIjQz3|C&KKOM zveTNV~gdvR*&g2sTI4x?Iw zEVmty(3iBn1AOuu)*&t{OFm@)7pv9kFTj4eWF#h=Wt5hIMqEWrl#mLnB%8@ULeQDf zdl^)~1yWaywc{E+ih_1}M}mS0U(mzlz9)SE6T<&D&|gLwLl${8*SoR)lL4g}pVc-D zAeO@f+cgC!d^|1rDi2?XpK1 z+*@#Mc<{_3g=`GqoG859L&<6z0v}TN28>+p9oO>AuQMC9CEr+lvHBeu&sh>d~LyWI|UwW9@zx#6&S5cY8M|-;39MS z8AlrNYGd_K01|gEI<#OMlkn6 z7B32BT))4ddo&0Bihmq=o1~8Ck0_LppudB>%1wgF6cx&Yr`gX)3Qg65@9M zN>=x<=dO zR!+ce$ED62@f(1)qrxz{1C);b-au?gOYj??uBt|=ge)2M8CGKJ;o!eVWYEKJAI$>l zahIyN)!c$~5Do2U?fN5f;IEn>f+f`Kg>qTpS_F`suTq+x*vvKOs_&w#c)en3d>Hfaxt6sHlHc zDsobAM!&ZH!fo!P63r+;VSHc*)pKm|nY?!Yhl?qCJCj zDpUS^S~ntP@DC+NTuLh4?GE^*I9&KQ_m(5|HoIZ~Z}l7?wIOa8&>fQ+^eI8%Q{~6& zW-CQOz3Uk9t6?lvjdmPbPm>o0^cL{~>Z!TU9!9TW315>6uCJm3gNdBvJ&elax343;g z{~)C7SkIS;KaGwW*Wcfgr=39EZ}_c@t=n-e$F=_HVlW}Yd(6Q93F=pIYr(-yfVz?@ zgKBI8vVqJBwXw;c$f4xa(FUWdRotx6nJy6Qz(B();ui?i+)(F^IcX;V*@A-WP`@56 zM2I$1PSR~wuzeLXQV|UiIhts4zUBv_dg?$`a-ApYl&njMC`l55T@rxhG#+S2svzbE zh^V=m*e0VC{rg3aR3@ty4jTb?_v`1nIe?j9(e4q3s^XLr1JXvyVd<8Yrb@UjG30Jj z2sAJFvzhS3=BtgiWKz<LKc%oUoUL3oPImWSCbNp@A`SHF5f7`++c$ z0?YMe7P$9(uS{hT#8ve=a7ttn1q90;G2}J{&;Z;f!jzcQiA5Uu?REj6#~4n8@`_%P z&IU1G)*PBt0B?A#9pyi1Fwd-8zZnCiUd-$Yr7PKCTW|bL`WhHKcuyX31({_=`wR(u z4i5T{h_BF?9TJwzxc!p1HS1YM>eL9(m9ExHexlU3HIyBPPH67Rxt}nKtF@^t96*2` z_A9B6YR7YBE=7PYfjQXtP;J2v_|%qzkp=({dS_6x0I8UCF)AqtSCz!2QWqO-B_(MG zDUxQ87gMUs`?sVIqVsaw$khJNRv#FbM=-|(I|iDj(M#R746W{1VIgpbq0&r$lP*^IY&{wh?CtIUM43m@T 
zSazlIIvu5*!o}jvmLof>jnQf>IUJ!??$@JC=49zJEOc&f*xSPd#)y~X2xjOv56ci; zLYYLw9nHW$lFci%`VFhMrVny z`j`1;K5<>In?<~WVoOCoZf;<2$K{NhG~~3*?e_%kl8<<|fefuN?6>!;jW0I@*+3zU zNTgpVamM=og%m|JtEN*DgCwC!Gk_of3y_tzE+?d22iu^<0e*n$xWY5Ja$=8%+kX@Q zW|;N^Tgb_p!X9;3bMxsa1M$lx;)gp+sNOlxR~sxgIfB)+l)}oq^Mzco=z2MdbR&R! z$9vc~C+|KVm$&@$C1qXCe|p9(2+Pb%P3Pbi2J%!I;8=4 zPo?{coX)0~Ml7p5v{-o!41@i+R)#S%7fL#1KF92w>R zj`-gZcM<=`N6{yW|shmt(}=H$*gT5s^tU5 z)0cK%VsSY}wdP#_hv@;3P39(>>M7hx4|@J+m{o6`xGIGXtog+@f7rBs)ef(=U?XW+ zFhj7TJ$2EDmMu+cB7gS&G3hs6xt9$Pos)-^%XoaHo>SRl{}wOBz`;gPS{^i>&{+Xw(7{zy8r%*a!D)dM@dSb~uN zkP~Z56GhEib&s1+a=Ezw&MH==UC}(5qr~mio6~%Av@>r=_D2(-XB1XCgEuJvK7v?A zdmiR!r6-X}{rZ{Ir4@$;{;%lBcPQ{3-QKs&Fa0t~c3CYj9j$+R7-BIhw>Ug9zuLWxTrPDErnm4 zfzmo1dc$5p7(5yswLt;p)VK=pPL4l(WPp1S@orqsw7zpSdP;9y(|tZ70+`kmgea6i zLRPab1z60$6!DpF+Y$YdA5jyvD&E=p`uZ~VDLf!^fK-%$ANDbP}Ju)v2vL`yX9&M(+wGG zW)yl$h(6#sBn-jqzjD>m1gLa0eK-qfM7S;Q z^}r7cvdfOuUj%UxV7-CZnx#LHTFnmNQ^bEUDJJvxze3k$*!r1PfP(s9%_Ayyn`d}n z$M+b+tCeze@hz*mIQ<24W_&DydcWuGDlAi5m{DPqI15br zLzEvk9YDb>9C1#w*!Z3WyI539zkINKW4^rSh#rrdSs(d_DyGToNdmOmz*d7y=l>V+ ze_-2ewF48m;B>H@h^=wmlX;tA zvj#JnSmKPqiA7cUeY9ScjT+_-mcHY!-}$%7*S$R}YGJ=`aBGScS7LQ-2duZJBrO<> z9o$xLdW#vZ-0`6kV@+=Nzvu%vVR{3UWMO`%<^v)rLP`a2XOj{$)=l!No=^WmN5bv> zB^Q0ho?4!)am0*fm-3iAkzKFRsB=kS3l8m`Gkn>q=G{`*z+=6cp?A!=3ya+w-#>8N zg~rsm)!S_JSigV5xPpZpOb#`Nu_rL)Tap527{Jb|P7|cq9yh!vs%;>kfdhO%?V(Ds zYe9M|5vCuZI+jYVa;5{vFh|vRzebR%66`p=L&Og_k0V*M`n?wU|J7vHJwD__zE$GJ zh4ns+0Ud8s4NLikl=YW{2vY?%2tYX#=O#c{VJP*FM2VVnDvqqB$fv-QOc>8~a|<$H zrjUAV=6JvjYc0m4jyafHQ|dLc&Vo4+|zW){gB*QwA=Vi8&a|AhWP9hVz z{b<{gSrd8?@?NwlfiV3@$u6t*Q)31%Jl5g12Lnl-w5s1C zIvP6_atr?NXVphvT`tv0Os_1#KN4&i9 z(NfhFp2dnA8z=)PfVD6x1&%FD54(7&4wcalSr`i^&Mh-iE8W7d=DX5(o!(ke2K(d1 z=^F^n6}EoINs?F0|MSy4MsDO4JR=-i;*th)wF~=~0Jy-O>^F6iN2K0I+`aaoHhWm93W9c_= zpi-iy7}tZf1e9WPAmn5jnzX}G_k?}!28A)glFG{$bj9v5?3`WMj-X-RHh=k#h_4Y( zaG>&#Kawoa7qBDAezxRiXG{Vy;#-9>{{PbLH^EXPhy z&lPTBc5yR*_RjE=HsU_w8u2G$g~gZe?Gfgh1Cd+qcLv0$7WeR!3ZB6j(+u6Q)`mU{xT) 
z633JA`6_>IBq8#C8kW#OgbdiPq(#+7R+K|JznJ$Q@T!F&gmnE2nl~pK2Q)2n@{8GA zk-#vC6$-9j*|dZ(Curh2;a_zGudpRK^}L;ZXlLzMv3~!!q^DKBzF1B8ouNo~Nen2= zL#U4_vodBy8~M_94X!!0iA5hv&NR>*>R%+GDh7pt^zAZ+)bmGR zXhzmIfchW*I$AcB|hn_9WY}>TjPV%^&1S`NxatC zr51J`r?vf-EYCBe;~p3uJsb!fY}&veI=>xhYR-W> zaRlctJFR25VEzxhL>Pbi87?uy*wX--TZ{ATS$p22Byi3QhmRz1v$Y%KlxeDll2NDnzJ8*dtWYhmen@*-VeXj>l42P3 zze%xquyTL?}ER#+(3ZyuRO(9pJ~56YLCfa_ABd84hw5NdeKXtRiQ zRGI_?Wfmevg7cNq$-r*S8(D86NCAi%!J1O8mMfAWUlJLfu3wdufrPrG9`V^8kyET2 znL5zN30pj%yUk&qWRdme`fKqDn_99X{Rz|bZ-^M?tmjX5;Sq}(E$he)9^{(Y7j=$(49k+TSb=Qigi62c=F%XLXiJ#CjU)1n7IAQ>9~$LILQM z|6Ay{lJzeL08(Z8an4K_e4k_>0XW-3*!~uAXB4Q{eRk_N`{K`%kvo+9t;NHE+g?#X zleJiu^kW_o_NlQoBgaZte`Xv%Mf_?SQYkL4C@1z-5KK8xd7OKbTuX%CGwLa4$lJDx zMpK<;uJ@h}=m{FJ;A~dOG8}O6Vh-k2-rF@6Bnqd*0jS`9Fr83O0i}afsB90?i-#oE&jBih+kL?S|lL_aqqfpwK}jnJ=b+!Kha&6@Ct{ls_#2o+J#c| zoek!@eSc4}y5QL4Z1o(jo|X^7s}dYMtAu|6wW$KFcBqTp=6YF@E_ML&O^l1& zQ-?n!F`%arO(41<4DGOviU3k>#-N_`TOz%|*qO_E1X8jES0<_-h!{POkLjS-OPtlB zh&^&JJrC-wxkVCr)p-^GuH^|hch-mCLMBz8W+CE+3Q=pWHh~lRzX3*jM{>TfpNV?y z_lU=c5hPt=r^@EeR>9t2F-L}YNCscy-Na5d+s_2n>UW^2!&2R#{HtYdWShi*d1bx; z|4Ngi^fm-x8-Yk`73`95z+ix?vuTtYMX-ui2ZjF(UtP0+@8lYbi_frig;9&o#pDMnM!%XnTg|C`JCXeAdw|`Bww_)gFo3Sm~(5v~D0+uKUi~ zT}9e=_C&T05ymzGRsY|fFD&dJmISHC%~?pQ1mxt~_tvs{gL!UBl;zJR7Bfx4PV zbUT9)PbE$HWZ!Wx5SCis74Lb!bTRW%Uud(dHxZ<8scINIzGoy7zL|_|;|%8 zhn4ss+k7NKL9zMD*q8y%Pt|l)_EH;G`YL|EgGJf1=cSGJ+sGhiNB@PPg&IiFz}uhL z(GvznLJph8FBM#2y2%D$7NF+L4%=e?d$eil zx(RDr%m`_XNu?JR>;~fDYPFu=uY1_Jt35MeWBt2`AI2PY@Fn+LW@mb(#VV8$57b)v zWgD}aBT52Gs$tds^^m9XHa)bxOA%hT`v8=7mekk=x)UCt9p=CJ1TVCp@kbK&BXL{; zFxOq3O(V28z*qeH$l!sR^DtnWbBW{Wqed{L8IRn^j#Uw>|7dmC8-)^91guFRyH(fx zoT>c@!+Lw=rlQH&l8(UoV28Zj)KL0WgteeZk``8bE?BkO`gJ(K15skcB&7+yx3sQu z8=lOqcNU4H3e12-vwME}CnQ<8jv#^ z4EK(zQnEDa|Lhoc(u2}QoOwa%Wd|czCU@Rq9+EIMM4}e{R8o{Y@jp4vM|I?hi_|27gXwX% z(_a_TRS};fg2M9^0FiT+QyI1-m{KU}6rMoyJCe9_vVZyVntL2hPpz}T9jK4y=WEdj z*23Plq#z=o!w&s}b%_4Ffm!nU_lPG#&c$RuK>%p+A?YZH3+(HoX?8@-6x>qrAC#1Z 
z@mm@EOJk+TjMVs&G{c4I_6AGRI;skV5#je_0WgR#qa{&``yZeeo+3sbx>zG6c)%!fld5iKkkToIRYm4J z_`!w}KN93_#`6&Ii)sGU4tK;OTx`llkz~+=;njP#K&yBpDu&^#^p(?uB%XG@)F&L6 zO60MI0R`^9LHc|)B=X8eW+ug7tl#Wy?o2Hcgy9@E;Lec!Mk`6iC>AxJJ+NBu3`s2T zEp4yCIL|5gah%QMXGmg&Q(u~~VUc*B) zddA_N*C2Re7e{MW&y0xakhE@75MiR#j2UR(Q6$tX$Lt>ebS0hqRzcg~{GI=fc!~Hz z5>ny@hBX2fGb$ymG?0_Exit}gi}>v$A;-L0@0Ln5=t>??aRo(^#nCS)4Xb)2A>G7D z-jhwfB{h%{f%$91FZMX70n^Iak;*)AYX7}eHZF$aJFv5A=ezYd;Q+zjna9hPusEm0 zE*50LLHCv{U+7}2vHvTlD=djkUxoF7yx~TVOS;<|!EWncvht&C&xq&9n1MFJFQh8I z!H`kl?Z#z4c>$8ykP+*p5>H8H_-rtwum>_N0%%7DRiO25xaO9P2*}fWoZ13Fzj83o zR`k511?`OwjWzjnGsAgJDv+jn@}G=J6xK?Ms6Fc9f-Gt<4(f{az9tfy0MUKI`lP^} zm-{{9e?{aGzZgE;@IfOuwnEi)wBb3E_4D>?1f!mv9ExpA5k_3%o+WQSBQnX&T?J-)#j7dNUU-g+X@y%`ZPiee&F((}K)R4C z+2kqW7u<;p%QyNE3bK2ZJ}N7vO2yb4nA?Eg^ZoB}lwrz}$vk4dzW#~&$eT$aX1&Z0 zB5q^zRby?x-o$fnv+X1SlQ6tYY9LB{CmzJsizyhd;mbU>LnL z0oyg=FN{$iv{?LtZ|m46)jod#brccbdA7p_jh@?yDN#N&9L&a`%&+K8ZpPqB&N+Ac zoeAd^JO2g+RIDAC2&1sQ@nXijV5x#US;CRUpHG2WQ96QIZepp$&CglqGmAgxOWz?? zGDacvsDo}4K+YJyiT5{vp;s=dHH}fTHa_{Pl8bEFus!Q4YVf=+CCj1A5wk{>{R)59 zV@?+$Y|3FbwAkiWzP^$~bym9Ct@)1tw777g;9g2hSZVEog~^D_jj?@K3`ssP=N9%! 
z4Iwk#`ey8hbyYl(tt#A8RQ>8f|Txh||1@@DzA0x0feg*y4h_J@c5@D+F z&|(QnBhV{u{zt@j(n@hrr*-tmXf1f4WSNMNfn#5o2?a=WA!G`s&K#(K7p!~*+$zn--*$en2R%xt1G#uc@x14{n>B|xQLMYe zhdgj@U8E-{->z94bsmX^&n*01?V&oc83WYfHsE6ROAVXE3r4PdfaGyz-HB4r=!9^aEmUNQh#jO1cxJ3Vt=d4iW% zoMAu_nt_9MW%25WzeDPz*MA{vyc+6}1CkHfuHj79?{NVol(35#tJ_-xJc3`+3$qJ{a}Q37o7J+a&h(@q|w5~MI8PR@l$)-I}Qf*>2$2T06p#fdqy zBcf)+NL@EOVF!$M;GVDKG39cw{C*R@cho9cI<_;*yj2y+H$kk@PqFt3_aYJsKIpVL zk8Uvb`4v-ni3s5HXeVLYepRMDUydgRCeH|%HRoJkH6);+(oLTP75_I3mYB$_*mB)tt$=zx;hdKzuc-Zq5( zl0zZcSZ*7+@zRkhIRdW{e?qDRk$ z-E&jh6m*7vedo6Ct&>|#S?X|3k8p0>?CglYv8Y-|Cy!GGqOk5^{kpK)cTk?#`4PNP zKs8%8nwCQ>$iN}h+ZNtGW9eSV#cCs?&k^6PK%Q@=P?hE1GgA3PP3em39eDCy_fZfp z`JT$;NR7>a>ZvcKQtmK}@s6guq9E1V9MTQ_jC6CkdAZ4`*WB2(jUu+_l_#R_fxz!2 z>l$6wq7ot%O2bx3hACx!K#lbgU)MV5D;*R^Ht!xcoMH4OtC$vWWWc3$irf(l!ZPYT zTaX?=6>&4xLa4X5kQ!{nl)PV2PK8U?^Y)Bc@iIWVg*~Kq08ujYRhjuc;nUmi_B>4V zj2nD3N#X2qr6-m~WwHKmR1F38-VD}veAb67=#~0RfSWQC{VDwliz8s^fL|eK7a8EdQEQ-G0=G5UV-8!C z*qwiQZx6KTxMoS819ocyKyAxFi{%))3M7yt2ipSxzqyV=QEFr%)b4#|6s~-x{&?~7 zBJyN;aF3o?)ZUPE!QKLZ^kK4?0viyTQma|Idjp(7h2C1MRDlD$W2lNk)Plpx_^O7! 
z-Jo9e%-YTZ&TpHe2;>8H#xu3J#CL9R+??gpE}!Z8j_b<|-@oy-{n`O4gSwKxDNJ7p zAQHKjiT4W5=bhQTia_sHxtADikwi>JfwbV**}f}7;DLu$7P+{8>Fy+eEE&+FYS`Y= zpOnMgG4RGadMYFQq|kg{OMukKa{{d8fc>zM_E(7cebNV@B3{Ph@rLCC=6ZvI+kGUW zkw=cW#B?=BM}q7$i_eXo86$q9|0jqTUXb7Z{*qq{b^R z%q6Nz_kV8>136eTZHc&K1W&%Mr^$Al|2umol1#I(pm+JG4`2Ws9XVvDh_4ZEJf^Q@ zcPDtA1&Po>o)#=`Wz^)(&Uj~J@$Ge?^RA*wr<`UoLrD48g_x`5e0dBQxxJT&uMx+H zU(6mBXSlahDmlV@o_{rg%D6^=8LC_HPZLl?<$@kSxxvIK2A$2(%**Yq48l4K3Tra{ zKny`kI=p9qTF&aUYZXnyzF$#$WR?XEbVIcE(3tc!PzTRBD+f+ROM+Kett;g}u10{0 z?#-=b;DTE3jg`JFp7IhY{v7N{!9D$l3U+~uQ|Ruf;41$Vukn_QA{Z%pBtk(tcZZyN95%H-_2uo6mHw4urNJ59rcNO6W|tjA60+CWZaAMKU$7_jCjDs z4c3Y=S~fea9caxumjI0n^q>Mj0Ybe`ghU0X{hq%&+vK4f^!N-rc1+H{T9g|AWQpsq zI6v}b`-B1UBXs5zjEuI*Z9c<=JZhha07{6NtLgu8$g{@a|8=vGH8@o&V`VI1WfwoJ ztFNWszzGmP0i`qY!qK3l8j)@{FvHi!h$k|%JBlzPUOfX@x0@;8d^Z-E=_7H3G7G;9qc2)*x@YtfWlw)?-Nx43QE%0TJ_fzvt1F|}8@>dfD z;y?MnDzop%4qNLBOZyN4qZuG-RR_AVx+jSBT{6)}6(otAR4jiY80{=;*p^WaWd3)U zO|>io)LdhmumG7!lS@0<4Q-4Q>cN_+Q|ab+w%|EO_h?l|;gQa&gy}2e#Pl3jIi@RI zq8!uW62u`l=JDsB9IX?xw6$1JFygQF_-z@Yi;r?3N%_(B*VZt1CStU3CN5=ItX~T# z9mgxG^^~aYe3$qx8X1whd;9dm^|(S*ie~KY2kJ8{SOpC z8+?|iy3*7v6*~1agYLNU13o+BEIIA0g454kxUJ-1XYI|})doJ`81Xm$xJv>+iB#r5 z_7g>WJ*D_LFh_=hZ-ng@j4$UBwdYBYv}_ zT&0-I*onn!FcJU3qIPbfV}A4FdXSQq@PV;Gk}H;gvn#PVfD0RCm>$w_k5!CjjA}%A zOq85CgK%TP{QxoS`k-w`KMv~s#)Upc%UBHkvq<<4t`<`KLisyUt z#6Rul!c*LKoR|Y~Tb1<&%Y9vsW0Ftr4f;HS!y7QUF!dq^ioiWYe1Yw%O_u-0pPhgz zL-PI}ePvDP-q|Ui$T|!5Yg-JOvwj5+*F9L!3EW#_Y2nSkjU>@(iS3Zg4=~vy8QBz`gsm*bZz zD~t?&*|#;gXVT`LJa;fXSz*^!twBhJqkD|_#pWo?nCzL5a*vZp5!BflFiQ(l1!>pF z9&RHb8Bk;FSI5uqD9d#i?Bbns;ph(UvqSseMtqA1=$H|0>9AHY)i$stw{SCil^+pB z#IGo1HyU9|>OCb;{9>K)uMsaa3vC#%3zU17{jzL=5CM5u%XnQQJRjpHZcTF_I?Vfh zvj(HuyWo4lKS~=58WEqV@87;(&7D``yDGhJ5#PA%$@Xs%{TgPje8tFwGDf5JdVs-( z_Z-2&6EORCwqOo7k5;`ZHb14N6T2Y*)v%idPyH%3>!7*iYi|s64c57^x|3@>7O%>D zih!^7Vh1dIH^su@*} zjeH{apGi3hoEGUvXAI91E=9e6IRGb5$(XVCB(@fn(P zF#4n*m{$pabE(pZ0Q!hU&ynz-5n04~(TB3fZM2N#ID(BALbyc{Sj5I2 
z_%#AZDn2Yx%-=R(Hl5$Bk*LQcj=1p}aFWu1eZ;qj|2N`q1k2nOqcu&nFva2}A&U@f zXQcnVfeSS$E*yAmp+_w74B7q_kwyG&9NvbEO99}){C;PaEFvgAN?#^?eT|{01AQPb z-`S!JYY-}hiPE2zU)7x9jp(5asK!3ah+j={Cwg~cQcb*rlIm()9PFh?k=oyCd9=%QW-K8E7|~C}ODsdd2%IkJA2)rbOPje@V4! zc1VG_1e2?(vIt-$A3s^Mgfhs?0_%o?Ak2qXMh|Az_A^2O)yN9c3~1tML><_5SQ^h8 z&uqOTyiES)0qi`1J0AFp-1xuK+IZcB{X|X1LP<_HZK~NbBfYQqw+@%u*#6}x6&E#_ z(i~`JkHoCZ+B*tc|7rboe(vn49%o}yy3J`g691?PYi|=oUnwt2z`0SzlE|(zE2^NK z4_4UtxDhNf(N|@Fq$IM4DJhcEJ5L(;7V&?Pg-^E1N0KkWmuVe-OGfzQ7@gbw$R0fN zbg>N(;kE1Bl8}sBdm%6_D6GkPIQK>s%%|iL8+rx!bjbmBA2Qe-^~>>v4K^@sTK|<< z>wA2N|F9yADM!!N2ts<}(kxlBgUK{tQ(}Euv#TX&E!KM=4?K_@>^EarVeIL|ZO?nI zQl7)|+a=`2)|CVeY3JvUkgB*}pPnGo3E1z3(=!1H)XaX~v>%sx6h`_gC&{eVm&TCM zlJ`9@KeMjlg8wRZq44YP$$(26w3b=zP9n{2Q-L8Lv?o)VSc}%GdIvr(wciQZ&KFqn z8l$=zl9D#^m|XvWgy*mfgCqAbjf#>&#cUxRw)L zaL@17XdzYPVhMzSK5=r#E@w7X8qlxFI4w1yv{XW7FI1r> zn5eMcM|@BG*NC5ZW(npdpfl&^0c=|G!`tGOegl%hRaR#0fls~tJar&yWT`9>bP>1e znAv6@^t|rBu(D@6cN6C}$Ou2L893F!x<9AkzeW6M#9Z#zf8jY-SE$dCaYBza z*!@UUjE{=Yii}5At}tXU*eyr${+3xX_tp+F3!+<;3lVXN_zH--eV9r^gYoeiV$j1| zN`I1$W0@>6sd;QRqeuGNb2`E{blicvRjsFCQO+#hWYfjdMxp`}ec*Cb@j2L-?JEa3 zTRY&<9Kbt2-0^;G_}xbtjL6+bDh8$L=31O-4Ldg^2Uuy+fucjsy%ZJ$T00K6R}a5F zvF#&HL&fp;h(9sb?Or5tla)c%U|sGks-`q@#wq-~n)W;y{N5X{H6>sB7V%?6fsRHE zY0V$k=hsy<3IL@A%BxZ5?fT719|{Xsd)SNueZ|ZMToMzWh&}_^%^PR-)kdpdthr_b zeF_rMX@1O@mwhx3rXG~AkA^+4KV8V<_!2>F?}#Ng`VYS;~Q%pFLGuC|JGelhz@dWnEn7W)AP-gO1 zA;rNs)Qr?j8S#b>(>4^=>>&2Kfn(|tG1UZEzM?w}&T|7Dkr{#gA$&Vn8|8qP-M37u8PHmil(#XdP{x~-+)_kR3HZvHd2BoPP0$6{6h}^SS3m8+te|?FF zh8{K;=0$*X+c3Ats_vhSHja2u~ zh(D7u){NADV0;C*>y4n%5G02LJ?avqQXZ_?Aj~!Fp|*2tNxr=bGp}c!;8dget4Szh zts)DWJRhmI@35%WrmIPw`hf~)q-W`gJa0gF+tcH5LIZhC44@{}dgh%C&t5srydSrNxpzB{J9v?>`t9QJAli51UI|e{Ry) z&O~>}HYD0rKjN_IfW!f3)^s?5qpxV3?m#BhdjH>PxmUMaX`)fFlg}_E3l>sE)ZoT0 z3~{X^eqZW-r%mIj5$y>y1d(LmFyBKhmA-y*;4_VUvd53;rfOo~-R4AR)u>BmO1L5< zBb)gZInIYK30_JhT#XftI>aKW3FaKwy_KHMkQXH|F8?U{%qhaB~j$b#s=t zf64^t3yh*#3eig3RF^n#RcZwh2HrET|KE5R#@m_#9B|YHL3bc@?+m&$YYtr|Am+8_ z5pri}x0*f@RRY6K5^cuoJQ9q 
zmwbQ)8ElSXs)H4QI?$)kaFDy^XaP$Y3^_ z07@$VdyC$qHMPC71YvG)@#R+>{S$%ih#^SDAWT)PZ(+2|HD9`JxyRfN0Fpp$zvjXX zcgQouqIK3YHPH|Lijpzy+IowDiwq=8 zwpLivrl9zRn`$;6X*g$x4`!q)zZaNOww~kQm(W-z{|Vmvvw6?l40Peia|~^9x#NbmJz-QaT~`GXO>rHzxV9f6K`_MUIP;Oo4e(BLf>DQLzR?g#Er zs`Z(L*+XjRJsSu}@}|!GlXvyirY)_LZ45%>&d%K#5d9MIzap*?pCdlQWTzR#JHObG z_!Y4483^PI0iFh6d7sjzub=6S-BDS)awOlq6FIn;xtl$U_`KZt9Fr~?XVk2Hpt|5P zAXgLd_lV!9qs`1$n!Y7gc!wK0*TYVZ2_%5fW=LE*64hE99A z8NDV9wneIRfq<{5WQpBLNq~%fx)QMp_J5?BJMzQYrUa)Ho~k5glTMG8+>YGrhZNA& zVBq}Ml`(g{4f-9D_q#e`U@xwGVMjgfosO4{PT({2!NnNeK)lzmMhnX+{RReuG6hu6 z#r)2Jbok^qFk##SHk{M8FxXtv0ZQw*M&LOfv**hvb7_@X)#0O7>^b@UP9pwb^0cs| zs9)!g8Q=3r8KUDSdyGz=Xb#+|61B!f)R5VkRnvEdAk}R5VGIaX@ydytSh*4cbiu(( zLpEWYN0Y1Se><8GUXnU~#sQxD72fU9)|m|jc8Ifq;0gmZ8aXh*IjF)^Km=?s;U#xC z+Fad<&HI%*{kj%Ij)qJIR^h>9H<_{5)g}lHDp@)HrDH7%n=Ye<(}fB4gtd2@D4kW?begE^{#>Q_ejUCn>(dYD>pB>#vgc`!j{is1qepD`|C z13>1>(Q~~O&P!_m?l9KW|C}PeA#p#j;kh+XOW*Q6;;-cT#fzwqNB_{_%b9@jh6yZA z&JG4faH%pKO<|T38ySG=KqbzkrlOs*DmlmNmH)dqfW1MX3*EOpGY$9V0P4+VHc5SI zg58V}ci@QGZngY%S|N5dDE zY{%}S`=2A0n#^Xqy|XSRIgRuZ3MiP*c&rH`;}r22@tFWR_xx_x(Fu| zK3;7hdn1yzSulM>X=#Ss?HKXvvKMZ#`JIhp?zxZ?{AUlBHSxzAn>VvC_t;hy@tMDt zf<%Az*~?%;AEi|?3VVe56GAqXrOUPb=jJ8neCbHyU9r2hL8r8U=qjclR_<}6IbbeG zbZP;0z*MCa{Cp};5%*a5Xtd90iT+~!%VZ5QLkE&dNXp&e!_y)} z!QqycqOdM5t%o`UBgtY%8(uPU^e5uR3EsW7%Sm5_b#aY!EPl7lu>*UlRImVHk`K#! 
z<$P|2Rkv7J>H~XwsB_ER257FQ6gZg>BZa>b8Hh<&f4e%kN2-JmYac!#KA4ldzc9Bu zT(1yZ1E5V)_#DZ9u!~;UICHkP;j^!qNWFrmzz^07v~l|P5nsS=VN0NN*d0K#&Hyr2 zJHu&K{CvRZ4={2YI&K|~-YT%Usfz*@GHjS$%7Kfd9IXXQokIg(epv@_m#Q4>4rj;k zE#iB`KbB?v?IUn!D!K!Qa>9EQCTU$zC}J2hI76v`W_8nvHO?qcY`F0T{C;>G;J zZP|bR@(Ss;BWG4q#bi+`Uk|eI@Ll6vbCTi=(yc%j@r`ckn~ZC)(0t(wy@=D9614pf zO3EXX&`v0^cI-ieGfu1Mg0GV?H!7&g?lYTLv{0dBR<#k7s}a12IT7`mR(_`q zYjeP=#VDLGsoi=SsppCMxII~X&WM8A2GpcAm>n`yanO^YxdsoHAuCR=zsA2RzW2~M zQonX!fJ@?ls+C?pFtr-%zq0rZf4L}Q%)N*e6Mb{;XaLZUwih3&m;z_4L*J6E4}5yQ zK7fTeP<=`P=*SG#suOoyUqXh*$k?Hass2l{fzwAIdPk4US9=ajZj-h~Dl|oM3*zvV ziWczxb6=8RTuElttpQZ9ye(j^w}~{$@28K-Ak%U!Ca$~(61>x1JS6KkaaxWh^B?%O zFm7Gc=;Gop5#J-8Eyw>6@rPCa9K`p@<_A`JZ$86^8Hk+e_9T?&Ycbu%@r$e1kNEdZP&{)qS<@i#p2WV~Z(17^py0SI^%=5e2iCnYCj z$4aIgdS(cqvJuD=mD%|K?KqNMY`x}E?A|m72TT5T$OF%W()2a*_ynbW!MW+J*IJCw zB9SV=RlQYOPF9alm8@BFkW!%pog?yl8L5~u>N6wR)AcV5AlDWsrH_{cV>|ipxe(RS z-m%y(Helz&6k0-z)|R?pT}EV_z0oN@U%jaW)iyT)Xev-^#{FN{6xa+mj+pI40BDx7 zGkK52668=lQs57K(ADOP%pcKOv*XX3mxSfYg144I1pmHUhmQF*@Ksk!yn1x#zw=D# z_y|5{maJsvPvmP|IaHal{pUxv5#J;JpdC-65E^D*1VWalO$V%eZ!Wn$06pY7FV-Zz zkv<3nD`F*nSdX(7g{WqBl^gfer$u4En;dg-|AW;~!h|zA|970<#m@cwfcXr)AqLDX zUgj3fE<8HnmI8}eKXF!SKfUCf>CM^ZM`#-d>)d!nw5OV&XVK}!cl@;rC`+5sqoaCS546m! z|3l)Lkt@2|GhH$htAxMb+b1%1z5{#gz<%2fY~paWORaCi8^n+|eBvSl6FDrZ?7~k* z&dmg-d5-v=6wxL&W@lnWjz7fX;^1-++$ceHBsD{Wo} zklgM3W@I`C8=>`UTFzrO#AjnT_2BnKX`gigzB%Wf`6vAU;FHQ^C-x1*8G z=_TX@3N`I}z8u8Ehvb>miz4uWUry|#ntBA;I;$?HeoATRt7Jus&GcIvS=+tLh~va?h&`l8 znuO-d=r6s7=<*KzOvceJllvEHo*o>Rqi?iJ!btSV})nhH(b$XCPwjaDdwdpc-Ohv z`%xpLj^M1qdx*#*-XdNi?s+O0EX2uAqGPF&Djs}pW zfxBnIdu~I{Q~}G9@2~6^LhW+h-X$U5eDf)1%;jVNy|(zZX%uWa%|Z2*^YS`(*iy^s9M|@zy!bbsIPr~oir3UT zE}^kn_0RV=5sy3qQ1{lOtC_In7NTIoe$_Wj*oYW4r%;r!=LPO$ zmO}hkV~A{t^2DFqM+7loUWFg_F$ZWu|5OR>ZublK)r+MVkQRPUOj+Ss3md} z1@)HyjZ}6TyM)E7LtofnE2=DNf$otJrU$UFthRuKn|)Z#MKxq28+{lV3;V`2hMIP? 
znhA=9A?uAWEE+qvQmTsBl?4W|cnbp#>t#_3o{}}4u$DzhWqfB8GS4WbC3h|Ptk`sb zVaqmO+at+Ri!e5`#>_{45`GDtwEzbF3OZyp=;^`}M-~pCFw{JsoVjW#Of4Vqp9)Y{ zqMtNdGZHzOCL-`ni~aW+(vOHAT(c|JL=D&L2MP&5RhFZ@jd+s@W4L0M-mxe7!RcJzKrTK!9Z>}M4I!1ndSO&`tcZnI1dsahG znQWolZpX*>=-1@#dt{p3x&Sp8#eZIQ&W>czSL+nJp^sYtn1}cLPkV^$z)Ey*^Y=i! zDmlQy>lyvV^X~n^X6y)VO~mguT;ldp1?w8|!hMg}&514PZHE>PEG&#%XiR>94E_r! z)hD4R@Jv@`(Ws)XFfwuUN)ow^PS%{9Gsnf|*39PvX10sKJ?Q#iyj^C})e?VPtjW9_ zm|Yp?Gv_|{R+|XU;2}YvBBx);U{2Q7uYzE}6mF=1R7x*k>(s-n77Gz{PwU4z%cjrV zs=C7O038`PC@q3b6leC<@D?OKb53+YQ?F45sr~eD%@sofGn=A&A_=)}+TTEzZwcr5 z4N5rYA5W(3O(~R}O-fNyrGjw~z?wwD8@Ef#;7p87Hf7%RW@o%Yb;l@!+t<8nh6FGez8(>30U1 z6jM{{^V+=p+3WyXfX06A^e=k3?<0Ze)~!)Vn(-rOcgdrC^ zUUUJ2Q~>FDfMOU6Q*kY?h_p|emoOoyzkoQ79G)EY@@R$D#rpTCHoQbKFu|nYjO@rp zH>#Nj!i6OJC+MUEU$%*egSpGI_%0NqsJ=0PsH&|VLw0_u>122(LRrt-F z`LM|ZO8Yek6Xa^S$RA5(`Dhf(b|-LcMH6MTjnRFbnMX>&Og2MK%t>u>qTK3Z=}Xp| zF)Ao*fLUfn(tjjDm~13_jl0}K{^St*6FmQdgY^t-Vgf+9eDI2dG{sV8b@Lu)s}$=E z0uUXI;Ua>zD@mD&5FT-iC!E#o&+XNif0=*rXhAmzwGO7az1_bG7(Lt>V8%j9fH-npwTa*lNfJ3SY4TU-p({V{$>}Q7bZn^bdiF0!!o_^eKb*th zw5ef~3(3Jsb1HKNNj(5S3uA;A>!*H=`PIVK%%svUrU&01f*}|p8Fj0$sMouGYZ(?NmlFT^ng$t-`80H(n>&+Pl=nW!9HL0*_;?-)+x24|$cHV$*4Oz>`b98EgNwJAwW~BRy_p2Go zyT|Bfmv*;yAYoBTXg)L4E-p{8fAReyZ-gDgMDUG;kt>4malLt`h`-RBDBs)hLGt9? zx)U?_#q-h!?aXq7<42>X7hnEI#8>3=O2OM@^PE`Bjn@S55{<2b{7BHBIn>8Bhprmh zK-{Tl#L-7T0$=-e5-S>79P*inli6>xh^Ng5A_c+e1pj)p4DQ}4%W4rw)9)V?E%so? 
zjO%Zg|JGQMMF6=nL876`cQMeDp=v#E?tfTye_oh-&f-@RkG;{9X%Tp^F`M#oJs-LQ z!)8RXMIXT&od0Vn8|>N6IcR+`9l>tB_+e$7p3Biqs)m#E+l7`$+gnjO*%@bB8j+N0DV9 zi$0>ju1E_yjC6xQ4GiVMXu$6gZ**140N&B%EM$ZNUM-WaKe1++VSi=%iyB6T1glyS ze$$e6>z6jX@tSPpLW4h>&#*+sR}-=G?-SmXoUA ziDm(ZZ*NQhLTyD@Mal6XHS|dtROk8&+m~7Qw6fc78-2q-3s!Hm+LS} zoaw=2gB^m5e@FZ?c`B*|A`(O>_o|miU9L9wZ{Bgvg7kd?Lw=*adaw?p4fHm<7iAq2 z59$q2$=-Zfy@uXBPXRl?8dO~T1Kj?^K}>5HJ-+$M%K2Xm*3C1KYb=YvL6P|t)Tg}a z9Rqrdc(gGeDS~T}kX&KXE8=Yihgbu`>g3I`GIb9(;OK!=U;^15*VFJa)J8`$bdg$_ zh1|I~_mY$WCi4}=@~jMOrfBFf0pSXrEk-}W*hvq1T~0vQ@IJE*A2@?$#3wWFj8HyT zfc~<4`&NO?umnlT|BBet-Bw!|NfDI1p!8}Y& zpN8=!;#Y(JqJ+bQXrcg%=pXiUkEU%Rqmn z{Qn2xrh|)nvzG97{Yb`v#*I5ZPRws*zB(Ja0%&PVnF5Qbv8dD|+@9T;&}du^D(OlItIsW&hcFfOoh( zB?l*~)h|p~06H+wmJGDP>~sV9SiH3@RMhlM>k?7|eF+@3Gm z(a(6WnMvvD%@+H~y54hhw-aB!F-s?~*nN$&x z|3U4&i}+s=f!&HU%_%0-YXq$%B2diT0V8s2;Zpz9F-fF*c)EK$(_Br=29n!pdvN(1`X_GsL>pC#T5;(5Z^cJ_#^4hJ;$(LKGvkCzoD?Ny#1>( z@z@SiI~x4X?TVH)U^!kUm!HC%jQ z2eE_Q9O#QnUn9F%MZ8D+GvYff&0lD4nBmo9#NWM+o^e_x_=+jwE#h0mF5(FX{%T#U z#by9i@5Mgi8u2w^i1-cZ^_8Zow~w0nbHHs57ybn$vB1zafa>5$M+|OH+{`UAx)>^0 ztbfhSu^qvrH@H(;0Zm-;&sXXHx})|YXWScD7aH0a1(2Z*J_3cpDu_x%Cw`JT(A>5~ z49R4ofO&7;BA%1vPp^Sps;pt<@Thsm->D=erE zeKl4(eaXHgyw9-eopn{K&aojw9m)5?CXWx?;Du$1O$kg*y-y)Fw*tz*e9Y}4q`G{Ip_fzCGn8Mbl4>HU+a@R{eL2|c9&Tn8arD%#Z2ec2QsYY7B)GobjH zRN%n|L0vv_U>zaqZ=9kxbiufBP68r-?s)m=*b~%E9vd!w#q7m#sZrE~3*^;q(kNAr|{&WFU zHQSK_HwHpV+UgbM7Y-md|I`w`6s*)X_C4ayi0^CoH5t;WS@eu>p)L^dDi28!MjY&f z=?_MPN&Q+8VmDIY9O^X#2fd?VlNfZn3Bb*G4-U*Ilh=mEa=^i(bI_SVhS5||B?(PM z3=$um4%2GLJm07^wixqd71+XLWE(zcCdWmwcBr}44}D}W)6*x0=xRSa5)bC9edgJW zQ&I%WEOdvt&cb+sl4q<*lb&X9EfUKK@NhETw1_%kRe!S*nL^hNdVC^6Z>VsB zn4)uE7>jbo2VIgjdc6vx0Oox0%;1H}()!#y;mv;4NcV~xs;D&udQMdj>U9!WQeRQl zHyNhSAO+{IO>bV8a0PUA*v!%3dq-hTV!K5RsWc%`i{HDmHGeXMk=8&)^ItkJCGMm! z{i57!ngN$w>KR|H#CI}1wn|D17O%3!8?53bI;XdeI5qg)GGM9bN-Bx3%NYoJ*5#7K zfJG5_vd>fb6kOhW9L% zgBxe3bpbeiBfr_L3D95xPoxYv9U&L?;2iOlRN@^b?zVDgSfwN9rhu*-;4dCXWKWw2 zOf+zX_dl}>b3yTJ|DF;ASsB=s#B9e|QvqeBRNMrVE-x! 
zh`hSsfG_EB`j1HAN0as6UIlS8tO2#KCrV|w?hoxpiJMwp-swd<@Sz8y zTI1(eg&NUbVs1(Ut`jWSlVRH1=GA6`nWf3ovMShaMz64@ULu~o0C=kbNIKO*cg?ZmE7v-w!!yMBzt;}* z9Pu56f~X>?VN1TTTW?)h3>a`AX9R3}{ANxM>(`Ik5W@_0HrgjrsLb@e(?{?_jCli^ z_AZbCH~5B!SiJvY^6i$7%T0P$Q-1xn*1#$^97#LR9KeH#2{WS*QWi`Ec5Z_!WT*

(76V!rL8HBg^zo^Q%tu zNLF~l%ccVr+_+;o-q^wihHwsM(YJSY{GE6%70W9O*`3n3{_>0vf2I8UNc7EZA^Oeq zcUt!0+*1h>9==BWE8?98Yo;vvXrlGK!Sc~4s1w}7#hN}A>p$AqwH?p5md~3Yr54q< zMFt?1;DuYCQ@ay&$Y!Y#y4}EFk9!<3_ppboiTqFmUA%-^MaP_=3u|bX(ZnMg6i^E^ z6Jx(OO@4`c>(`UC0tE1l$A<=yU}qJuCrF93h?5a9H18mmU|r?-ky6yQ3^X$WxMik+ znEagNbxa(1*o;WgL8>+QjGB_KNeSrr|1?%*$d`ldJiKlffd#@_&Ub0_aBHZ2VZB=a z6Df~s*98~9!?D%Y%a^_;u-k#;{gtdWH%O4J;dj6yJy0LI+B1n7OKVD&@GXhqB|M!P zI6G$l7bc+dI~u*08Etm3>0r&K{ns$|A}n9A=!S~d1*RiqvegKdvFsUVQ4>(E z*6WhC(o_Jg5xDZs4fmHz?E(4`i5jL+PB2hFn}Am`+B zpipo$m^Ilz%$9&&uQyO_4oBlqOE8t_u%1O(z=xLOQd<#S%+1AWzY{%|X# zG|~e++5RiJdPkI%Cq68rhrTg@Ihs1BFkMH#IZ8#-IBI;|ND?EXsEwvgUlVW8P@El) zA5r=Ajj<4?bpV>h5}&x7z$+CIZxKJ}t{gXOK%!8^{0yKJNn0j+=<;q2AWB4DH7xKm zq@vTo?WT}M=wm|7qLPB<$Q)U}=k+L(%urSi%F|hzcm&_WT0-X^F~4e7Sc}wh5I$K? z)U6Db8Gm(exwDeBmiC{_x&$^Zp(jeoy<~Xg(FXc1=D^&bL{173R;*A)Qy8N#t+S}i zR6rFfzt)Py0>LF-X%PV@`@EdJI$?~Njg&gBpQolvn0u8=QY9U?FfzEK+;{d3XlSuK zLil(EM|@=dQ73Og?spu5TlgOydIMmBZh`sKq90D$?PD;p&;+tn*9z=FnC&zW4Dq)J+auu^ym zA`7bi&v@)+$@TZR*Nx~UR_qK2y|dh<;~#GX=`@Br0v><_g>}^y?9O>Ti%M^kb8V+abVQXo+P?7 zEmj5}{<`@*08xmYrEETetHa6fZB*UHM}&1BOE4uNoa(6gI*Ff*T$i8GA1M_aKWgL@ zZ8`gyf7|$m4A(6%oARF>vZLMz*9-LjIA^p30FFDp1F&WdDgz2W1Zi`GC+UAWfZPgRGi*#?c6NFC6Ri2ogt zA-bDjq;|#x4^}Hp576%spNXS;P@j@|Ii_jTfNG>)g{_EmXRSr{Y3+cNJYR7Dzj6#k z+mtNOMj1sU#esycxABb`qn$@0!LUBP7ZiV)b)J>WNYeiY>+8|Y<)qWKxwPDJhu5SG zBCz^sgj2ti%nN?r-r zSo6}z{Hw9JX$|B@#EGi+9=~0JA=+gTJAHN7&tx(9w8RDl%))jC>K#0yJylp53^#U3 z3=Ser!G8ynie>D1PkEtS8((Eo{0on+{Y*|I40kb2{*A+*Gx)DOyGKQ{ojn>+FnFVv zao}J)asGG4(@8tg+XJJUGXaSWHcKR6rF*_D4ahd{*1`!<&v_yP3D>FpjT9FM?%lI?4DDtaRY=E#+55= ztCo#Kvu}9%XqL;|g#KVbcj9bK#v+YL5+doZa(yn1i+3$K}+v~_Iz&- zR=iPgJ)l_R>xBiYmQc3+=#iYc!Q2_Ky&7#-(t|ti|3b;D3iv&7<7$s)m!c`>w^>4A z1WDZQoRe1`shQhzz2>(aZ94JdrPR8nG13@KBZMX0U)oWkpThpkOk$c)|DSbrJGoWy+NfieYLM8)Z!l8;bP zyC>q_K_HjJ<{gRu%Q^#{)2v?^qFVtM&&%jv8l#q;|GCNZWKl*g;B2H|7>#{dru_ps76bLm_nzU8yq>Jt+k4DlwvddPO*ujoYE;RYY0B2#WBraxFB+r zV9YhT=ZLq6Uo2b9S&ZE1s1yOpEr`5$&y|J(}lIl(k6S}j-%{q=|r 
zS_kEdbZz#*|AfX)4?vEENC_IGwt+L3hIk`|AMGh(LC#uIb*#Zo*UblF2yUF1qMacD z5)0Ycc}!nAnc{ph>_oE*-Zxb+vfm5WQgMFvTu4G&r7W6MEFvPrsL1|&R>uYcmj0(P z80WI73?jBjW&y=2rsk+sF7)0g6-}XA*nX;aXbuDk{L0BrZ%*vsjGk0%(rXZ5^nx7Q(S-_h2G$jfmL4=^VAMWQ+tXV|HbM>XqQGv8cWVTb46$MX75ghotwfe=RDk{& z@vF@wIKev>_K<*_Q=KQPcVR~7Grvgbsz3ynS2cr;zxcRP+7*MGGGH=rtPjfFT7BsSK+TV`tW5~^iGMUxRTPe-A&R7}3{ z(0~@iQ;_==Ap0}`&)D}j8qr3hdUJzuKX}0Tq7Iy)Q##t@AN91%Sn0rrUf_Zk#uMy~ zM5_`{9pJ1kM5yaJ1DhqbfALgFC0H4}a!;05tRHCP=U*z*!DQsRz^k#)iwvl?|TBr=FlN3xv0dDa)3j`OT?e3`Bf7URdL63oPOQQl5upTCRS5b z7o#S(Z)b~Tr0 z<`e#T^9(Bt=Ez0{^WhT!K=QV7egDEcEd}Mtsxl97QL&bRiULsS;ApAf?3K^N{c{ri z1Cy?8YQqaO_LFgDFLsz0DZDZox%@xJG<8adh%F1<3KJSK}*f!_Wg+1|rzO;*pUH%9?yaJHYx@p}7} zkWfb|aB>hX3av5o1<&CFmF5Z8%X_}oq-=;Ea+n58Q^?Q%_lO#~bcF#f< z{6kK05r9;Lp7Ui1|1WUfa_kDVL*l;*?6l)-T_}A%6NV*bzrAjaV8N}6?iA_tOoPGR zqVeJYNdp!1Ox#(*QJPNi%0ojJ6h_W7mvYXKinulIA~&qM!NJWs^gjSA2Odu#1Cr4A zea+zp!CKvJ12lAlB?o|5qit?W&_z5)yhc2kB&#Oqy+lmli0glz#`tCpu9<|s> z?eCmew#JNB&SW%RPSvahhp6gsX|^z0I$xPsVM7BKCYTy=QEDqt)w5z3(+sL0=)Pj2 z4`h=gb^IqQ1*i6xTg9cpNILtWTI93!8PFN3Sk(*75+z1GBk*s`<{#EeFoHNLgL0*1 zgJGqDG6}#vxUL3FywU^RKIH^V|i<9X%>H) zSwfMm7jG`sf5FjLHnl>%e3y?XxQV-%`2N9k(*b$)cXGlM08RD)N%uTC{#5#m+}NGl z;a34m>ohFWA5}FEo9JsUARRDrIZvIU7s!o$fA9Hqi2tStG{@S74b9g*OCSn+h|B@7RN3-&7};$BCK`=f zGZZQ4CG5)V9~%+^iD%SLc*k~21V32{wqb)UGswkOg~5*j>~a1`g`y(P558@V9zSSM zD*(sU8o?E~{?WLA8T&o+$f9a9eUWETqj6<1yn;L_0OY6jBMtKq ztLINMHo3D^XiVijtgpY|K@mIah3WYSB_RrFDGe+BL`vY&DkiNZ9>af_M%m&+dfcE2 z$5KD5Qz*@Vuf}g*X`_EyCO*OKY-CWbUTl$#=xTeM&Xq&4kwK28U#dwtE6m2_->Hmp z;%ncTppZiWU+gFIcUJ2pT)+5?f9veP>J;h!X6yCW5>CJ{sj5Z8Ke2`XN@U!<#jS~_ ze;y-_xS%`EPljJeGsr8!y&&#ZxWkq|J5XDwHUq#f5dEs>sBd{9r?{tO?&4dR8${^f zsh$brlP!TZZ^1ywW(p(kkA&nqBNt9dAIz-=(4rs6m;fqO+1_SrcdS-=4TUjVJ$tyv z>2$<~XV}t33O!Gfdu0+;!6)8f24^$F?bnR@|3}0F5wx<=wqcdMPai#2b^h#Es?iVD zi_~$dPc};7_RSr8U2%+GB3>eXCmvmG484Svi&utmsr)3QHyJyilelmIgMpffpIk7; zBR;IOL8WN{q~_>dNazY6+Y2<}!7ORx;&*KF1?*fHe9D<6Uh%}BGzQ#9{AO(1NU?cm zPe>fr03s}0WSp#q7!)|O!B$~|LYb^jX4Y{}41em7cz-7M+pj60cYB1@l~k^#C*wIH 
z!+|^{&l6AqsIH?;p-{W$&gZk-;898(!NutD+ehT2 ze?1C_ybn5y94p~{@A^Ygl?BVus?_B{Eg;gJ&( z`~+(wQAfW<;PXfP67eel;ff$V$Kz!Lx!igwe!d2rO9NZ*g*dJqZ@I2!4Sza@oHY2# zBh^%Cz;val1V(!}VX>m|W?TIgAJ&l~EfDZzM`W}G`!ixdnJ$TQ0tIjGrXlptH(!{F zp~5Q%VPHFhDNQ#ir=2k%X^8k1@g8wUo6F6ONoy>@(wjZfFSk)$EvF?7z;C2BDx6Ma z6B&O~;0$B!fJ(s99g4pU?fgL9qyQCd6c7roZKgfG;8j#iDOhIpVJsX_*duR7(94mi zI?^WijpqTx`ugu^`3T&_Tf|q~P+*Z?VAs+ubeZ{^)%sLQ)ElPt5@cNA8)*DD!nm{} zVNcXrye3kfS9UJ2IrYWeQ3IOR4tU_$P9*q3PvdGz?KTCdl&7WQksEYT(AsVynCw9f z>KYkr5@nB5*ntEpfAhd;dxc=Q;?yKuslgUSG7#2lAWC-UqT{p4NP{Dkn$2Pluhs{! zW5-U$fz4m)1BVteraSA^bg4#XV#4>tfHOt`+_P?P#F`FWK*Qb)W|--nG^ARy55C6@ zcR=*-5kDZ)79WwBVCX=et_sQ{O1Sgp!vun`urny#-~c+57e}7f_`V*8s{=KfdP|id zEp^TUEf8>K%vXgc?(kbBHH0S&C^j2;l)>#O-~HRY}&jaBi#~-MRuf#?RP(ha5 zdj5UrLD)^jX2=H(blG*+yNmW)c~O z31c%pNaBhVz+GTHj#z>E1%_qSTj(P_zbZ}3(nC$xt`T!JF1{SEty0?nJ^K0O*8r-lP(*-|$*;Fx9)*sgxS1$qJj!1IdL|_83 zJ4zNMNor2XGRGf(m;y@&yr(g9CO<@2iCsz$aEbSxK<%ZyK~|Mvy{Y$XV5_v;6Via- z=kw`1ZyJ!+|BAU*@PY4~fiN_j@>e+t-(*VGU(tM=Fo51zlzx2)Ie|`fxCf4hnvyoW zPe9WBAYUpl>A^Y!Is|5Wp8cM?>!`T}vG+UvTJ7ukvJQ4z{~Y7)nctKzfxCKiIvy=H zZQjDGJ$0mI!~Y%e4QW?tqK?JNH4*pAP786UXzntf{OVu0TradLWZgY}bkn$Xs zO;XBCrE8hi1%!+a(jUf@ z{QSEU?mFj2G9dD3Yy9&i6^XX(!Fss8_Ej#&W8J)$gvupncvexANW@D3#M@@{8z^EQ zm$7C<(C~{*BHQp@wY3uDn994=*Yl;NY={U{dq30DpBW84;QCYYMkb^fFjyXc2lNs* zeL{$Ly96mlmH0hCKAb>hqQL6)G!S7b!QYYSn_;$6SZvOlrm<>tyR5A(4u4J_xx*9BJq@*aE)l646j*~|{(`%&ZAaXBKR7~Is={cmQB#)zr6HVCQ4d6s--pa|G;{vj@T38HVi0-NUCu0z2$F`2uqc<2te{7Qj27Es(R4i zJr}#Efn-!#8~epO4;0ntrSnJmudZKJlEB3YR+QEl>mRzX!OX_5PG(HAKb*b)GyY%^ zzvQDR6kElqdH3icfC?3s44#0YmlG_#FFx%D< z${>77=<(4q_{GH+80I_6wzIvIe7-j=((NkdG~rjK7uNQnJBtTORGW1M7*;XL$$%;| z(j3d*ZK}w6h~axurkXYDaX3c;OTK<0UCnd$?qDI4O2$ZRdfT1`nxTC71<$hQ!W6*^ zTI}y4{z_z6Dt>FD4|eO5C1tg#P!=4j&h&(%oVcqihSeG-OIKeB^W2770ENTWIy0=8 z`Y@H*4x~!D_zR2qDI&;{7H=;2@;wf)1i{`}9&`D~`@6-3307)c&p=XHFDc2N>o0)w zoow11T?8eQe|O_+4cj15KgI zZ9=PgZJ`OO_Y;AA$pywS=mfbGn0K7ruUM_)aK}i0r9xq2 znr?1frFn7HcX;B$VsK+Sy|Mnjku&ywfP5VJ&UdnmV-f>;#(u(V7DfhUNS*Ap!Ht!vRVPGc8Xh?F;mn~F?@;}NN 
zB*^%M>z8~abLf)Xg2;j#NK%l{y`y3}eslzS65k2*SaKDW4TNaek(7SP;cO55q~t8N zBsV9t61NO!kpb+$u98h;)*Up_ksw&VQ43e)eK6U;z@sEYUZg%e!$gGx-WdRwVYHg% z6Pt}NV$5G7USW}fwR*<~wI-TftYJkRsf+k-j~K13|LgvvMkt-(hRbVAtuR`VAxq$^ zzCX-LFH7Z<4Q~$XDD`#_+v>G-ws;CN2ru6A;b{z!e0)Xo8NKD zXOlW0Dw_BC0u(=)zB}DNb*&eSpc*4yB1Sqy_h!tCB6zVXO-~jBJR9oQ4z>6kE~QhWET#C8Z4+q8w-k_Aafrc@Jdl$xk-|*k}cWon=#KI6<5R*5RO|AHo2#hqp|gD;G{y^ucFmdbrGv(FRz z8))L#0~9XQdK3(#sA^Wm@^cj?V4!Saw4gg!W|F=#3TO`oop!9{nc=1fezHM9HC%Z? zBiF!H%(baNfv}MQh_Dvcn%MuaS!SK-g?`>eibLL(I*TKoWU>zUw{XIl0h&a&u@SwKQv16)>Pzn$$lOeb|tIJBI$ozJAZAnxs${bptF zk4-aAXx-V$mc-m&o2L8uX<+1>xc7%nCLD3m;fq**;mDz1BA?!`U0_w1z4_6nz8tT+ znwlMXz&Q!QwG8Lx=Q`akQx5WG5V*ek`4{cyvP$zNN-SZo0A5Ji=N;htm-B($`gbSa zo;O}_%4+9P8$Y!PQubUA5+#?!UuRvfkvSoSlbhwQ*his{N44zMwq88&VBKtvxTvS+ zs($Godz*H_Wzx(XmWFeuIBdTjD%T1u7Cz5^S~E|6p>SE_xwSbx^M6?b_c^L_P2iP$ zef8hPg|_(?4_G&@z2$Nss!7{?{^NdQnd)5LAZdmbi|j&*i`4BxbfuoEwXShGd%!8| z&qYq=K*QIyn>&-7;^UHb>fVe}XtcYgd=e@JM${DDvVn}JofYKdz^NlIc#s#S7PDv)9@GB7mJH89jQvJ5daw=y=i xGBMOOFtsu;Sef_lFp7rU{FKbJN^}iIRtA<34Gkf=cYqogJYD@<);T3K0RUK1dF22A literal 0 HcmV?d00001 diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png new file mode 100644 index 0000000000000000000000000000000000000000..2c3f21eed13d8087195e16e857b0e1b4ffc31eb6 GIT binary patch literal 4173 zcmZ{ncRZDE+{Z8L+{ZbDtVo0(jYC#O#*ggd4Cioc*+i5b*@_U6kx@xz2<6x#4aeSl z^~)#?TZkf_yXU{>kLSEz_jO(OcYWU9?{)uk;z&e2Run%90KkgX*EWK8jlU0)0baA~ zB3Iyz@#pzzD8306~KQ zpp5~ze&cT>M1mucBnTXN0AS1vKtBZa1HeoIz!S~{qYyL+OI6m z04TV~4O1onx*(_*0EkFN05Ajr2#)da0st-wdLa^QWTv>o#nAwaKu{l`z@=I8 zIr|vHb|jI)0+28gZ~>DLGyz+T$*`B%hy?4ROptxBy0H)JXT}}TL52w61{VFpI99)J!25+KM%@O2>YBzQ%d;2?lQ!qz`95Magv6!8!y z>O>f_0MHHtVDW$F<75J_2m%SdaTa*Y2Zl_5fC5lA+&c-Nj0g}Q!>&#UFp-S@@|)t( z0O|t%f8k95L?Ob)eguGU0AU0mEI<%J2V5KneSk#`a3~nykN~H2fTN?M*K(=wcM9kn 
zaC+J+!VOg?;TL8veT%=BtN*``ZkPMNFiAO9`!dD6Rm^PP*6J#sx@Z49YoE-mRy6bc z14eysH@3*?DXi9b$_S&T=b*VkhEAQOg#q2EYMmK>w=;N2QlVvAX=Qo7DY>Iexw*XG zVx9QaZ=_@^?9Cs}u$(>49D)ADTTj;)f3@6uFHmaQk-g)QiVHs#5FBMTcCw&}eDT*o zpDanrGGW|B(1T0Ztp0wTBK4R&H7j7eYGRi2Kam;kq{(YbS5(LQf);}00w&_Gw{+f| zdTO_8*(LcQn_Xp;;@S9`9JoHy5U;{Lo>3X#dsSgVs4jR@MOf3cK0Y9b7V*e4{Z8D6 zr>{#weZp^_l5X`6c|~T){3R@S!UZbVeEGq4pxJY>K__oP&3l!d?fi}d?YaC`zl2KJ zwS6ZqionzhMcmc&n);Q#=09OWwjQAoP@_e7i9hdpi0I7H)*CG1_{zK2Los_~%Ep)a z(1Z)kVowEHZRh=``VQt9Po8t{xYZOBZC&Cm=<0NarPoE|ojBpTLHp8_Re9amre0bQ z&!eZ8&JFFC7=5nxyE;`?I)M$S%dFY{PqpY3XW{s&?D$z1!v5-T`71hfxkDylN#$lP zeYBcVwsPy8t2*&+BP`mT+m6#aWqJg)?wO_mHPeK-BksypWKcJlvIjnboXcD9@Md-RMn&xajj8E~KirJ5owtpU%$r7#i)k#ooLTEwzztOTWbd^wDHW0^@%E*ALsx!;a@TtB@b9(mM3|^wd5py} z%%?G;f8~7|&SH-*)XZMl2J54XyAY>`p6Yb`_jo~*<@d86pamv&0-f!9*Ibc`6Vr=q z2=3+*0h<&VHkF;+PSoA@e?9Cn1HGs#Z|)uU6_Qy^D!RP+fUo4!J9`DYyE9HJuggkC zf6E-$+CMJ+mE`+X{$|%;#nwB<$BHXrq=&gQIhuyvC8j+8_!jD)h=g+vNu^irnohFT zYyeNQ#;G$22e(sd{A%J^!+N`Qj447lVb9>5H$Ng&b{;#S_oDvJ~7qj z`cDbsagOGy7PM{QucYrIFSWYLItDLPG`tKgYC{@^{!9*%zBt^zo3F#km2?^3lT($H zj6dbFhi|M(-mpDg+W&idRW+Uag>F4YZ4z1E>ZtmFn^56&DsI4DttQbxHPCoZb~ef0 zx%&CUw4O|}CPSYS)uWE=z9n>iwA!aNVZ7_7#L3DzhC8Uo##2NDOk(e{NVBcZ%e%yw3%ivvtQVGBP3GIm?)Xd zmQa82dh}DfO*k(U9ei*42LFOjiT^#70qRu3M~vf?)b8F7N7p}LU;O70O1)z8w?AS$ zl1j=SOlwwTg@)u=^}ZY+cMw!&rT2J7xa^4~9ZK+KVVXKo%X-^wfA0N?LC9B=OpJTn zvZK@bDVJ@LqCQIrX{m2?W05=+yDvE2zOZ&2D4icsO}86hZyCLp)!fLFpAJN6RUTXs z297uKpEQaJ-6%i>3?-Pp;82i~s^1riDp&s z^=xM!u&;OYxxJR-RxGobmTQ=t+)Hrx6k*tOF*2TI{F*VATe+>= zw$(Q3zqaOLBg#gYV-ij^<q!h*}k=#@hUfiW3JK2{$uRX~&8+NGDr63q#z-rEKw}0bY0`D0bRZ5lSc(pQZ zN7&|0C6~oqzrBLz-62PwKzHNG;fBR^oi~P4~7u3sw@Ys)ITz1>9giP$+ zI9=`a@l3@$dH{j2ntt3Kisg3=v7QwWxbS?d8mB6 zqa-CPrFl(kB-|*kkU^~3PQBKry+x;@`T3(p{r-!#V{G$nNI~TkR_iINaZm4NzP#Vc zF|J?kkIHmfl@Xtxww4ul!{iMBl%7od!a&h5-LpYTJ1_mNTuk!OC2dh2 zg2dR{;K?F96&u)`-0UDVfq{7@3 z*$d6wX$V5{RYo^m-cU3>fIVO+0tnpA<<$fFU`j0LLYuGwLM%Fi27j} ztspzWGcx-&+x*C-pq33`kSrW$I>)Kd8<}%4**@V(x!aid)WdQw726-ooZ09+DlY 
zC#IzzWArw1?QiB=2J$su!h{)94Tnhrg{n`^V#MQLk_GLA%Yx&eukqL@DIJO5H!BKM z!ZAGOvLXU~(`$`-;xd1_WL~YX?aLoU&794iO-Q>drE$^qyL%B4lR&WiDZIw`q+&yP zwg3KbuP`ya7R%aN!Tqy2d78=-p^pC=<67dN!0PmmZ-tZD^+2xo>=GOF?5L1jeDs*O zX?f>~byWc|wufM0-n%6{$r}!h8vGN0VZFwxu4a`DAUdyqx^RrLo zsuA6^jMCqHiB|&iDE0dUrQuZ5`t;$9E7R*vWg-gJB*`H4Z(4~g$G+NZem^7TTyRXT z`8}l9mWcN`yIRo6X_2>apB9uFGOy1U#jj1Ih-xwJvT8Nq4l7;1Z5UL)8p{aU*4g{R z%1C6v2}q-ts~Pn0PT~@2*fQB}?H=>5`lSv|{4JnL&015!H6~Lb%TiK%$2^U*sZIC8 zZ1f$(Moi@(dzaHYXyyKFYS!W9Hvg6A#RYe>k?O2Ry`yeKOKH@b=l}dJi`=%sot@_% zevc`-$?a~7CyJgqeYjh4n#(d+N2ic3Iz`{9N;}p@a!p{NE@o=0l#r-3qJH7B#WUF` zzE@pK#`_OQG;szk8n$Raqt{pD%tvWd-Dhn@_eQBkKI;wC+!=iDY?$UYm#buk+-=2)mmybVhn!?X3-^~Q}gH65{>j-Lw!Z-o(1Mdr&lGmgS9USKW z$rcT_J1wLsmq@k2xgYs_=kmwo{4zU7-M>@(tStfmCz$k3xJO+(eJ96fi9V}z!nIs0 zjL~2|wMMvf10B0I8hZ4)TjdDi*gQElo2%sx|AP_p(=qpRviEaVcJy(EHy|r5b5T-S zMpE_?MMhEiqJpxFlBBe}vb1y)qkGl=Nx1ImnU7=}r`+op# C0}TrR literal 0 HcmV?d00001 diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMask.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMask.png new file mode 100644 index 0000000000000000000000000000000000000000..ca40956499f1abf72df8753863635b24325e913b GIT binary patch literal 218631 zcmWh!1ymc)6UN=$Deh9i-HH_{UYr(+LvSb(oC3v*{BS7l#XW?yv`BH+q(E^f`x@euB)SIf`x_ic#DHY^#3B9MWgw#z;jhI zP{YDbEPr~K=kWMU@K#@26ASbIqkO0~=dp$upz|gG3yYNY|0gz9VKEaHRy3BbrkYvk zdKJnt$8N2ACmEHuCO)Li5=u$;s)EEki0$E7tXW49UA=LSeKp~b{)g9{>JsxSvq_oU z@{7(Le1E%ng=24v+f~8gIjY=Qs`pbR8@@_YC{DR$!;AqOzu#u`%l7Zm7bK-CD00g~@P zo(ihtd1G<^)ZpN*8znHqsvCWWxPW2sB}f3|XndrpEGfzu$C36pQx(rZr<^G4qZ2I$ z1Vf8f+KfBKN0f~Ku@ATs`1h_gh&!D~r7H-8=$<5yBbd>?aFulU-31g#u>%p@Iu7aGk4O`V7jv>Hsph0kP zD5ea&0}f+~j-Sp^B`u^%4t?u3!ZH|*p?eaUN1;lB|5+` z6d1pc`zjDnETzwe#2@Sc@#_nRQWdV00k24fCa1W-$!?! 
zrBkieyuu6`O4wh7FBxn}`h?+$VZE`&Ns}do%!%0lx)s z9=kp&0}@+0=^%ZleB&M4J4y6kVsJ%)lo%Q`PHj1!ccd_(QYNj0lqOCQoC@uoS*8SM zN3tRm3aDqs`kP7IlY+SGG|1|Q+whm*Ii+*{mG)nvH0Svth5b`3FFe`^8jXXje2)3a z{!`=FXXEq2U!wxF3kkV2#clhl8?0m=9*o+sx73XgG(i}yW!+e38J?P#^FP2P`M^Y$Kp& zng#mAmxbK((ge{v>H}-nqTA1XO&vT>Tp(q3uB0Aq_%0ck6iC@?F7EI^1^&b$c_8h0 zVLxCy``Uv9n6+mpI^QtD@1^!dNX-?CkjQ2z>RzRB=ku?#=uQB^_OA6 z`yd@=;WqK`wwURKS(Fw<7CF0uz==RMIWzjj@!LNsWZ!EFBp5KeQD4QK3GY_ahN%O# z{M5LW>vcovt}!PcK*{z#E>FvP2fRp9;Iv}ym3rl6iF9eY!-E~@PZ(=M!8JAcOs|yb zU$ZIl5><3Tyg9};Z6x~h_3ZBLBrbdPQk{YJl zs=U4VPR!jn{>x6(d0GUCo|-;H?$~)GO(X4K5V3w6PS8({*F<%sX<4%yxb@Qy%y||4 z_+>DqsmULb=ac%J%;YaNVYtI^JS#?ny=xKgkVbxu(K{8eb#M1G(NXTtPO6rcS$FTQ z-Ga@`p8+2gAzNXxdyblli`m;`RXKY1%X?`-Z0(M7O~f-oRWx)rqR+bn%3F=?tQ zB5gP}v3=sH)t$z_dR9V;=8O}>tP=UG$ggSXex__o=omA- z3rl)&xZDmEBfPIaER%^>#QZ9z9jc!)f&ja4Cfl$YC98J-8?pxw7;`&0UYiobeGkzY z$)f@7&QqRlKUvAIIiOoF?j7R!>PDlhw1xN9Xiw9-Gmii{bSCSu6tzj$4`5-USku1`Fj@g2eJyjUQt)G^Qn zl!r{Mj7WK{;+2j3T#b$!?br2wy8dK#-=XEe{QBuGipDl(W#i!6wcR=~|FUMiN<13d zgfOdEJ%jbdDJiyY>+=KueoRm#h(^boJdYQcYIw<;$+P_ri)MVmC6qFtjfJWopHz~t z%vpP7e{iMR{g);g8Ju_S)2 zv07oz&8n41(+e(*X}mwfa0g@Xw-`O;=d+zd6+HQSoc9SF_B)^s&lAbuq7&k+1tH}R zBC>NJ3Qk7`WwmPT$=``1d|Ie8&G<~l6TIIgaUn35IE=N*_9c^r@ zp0o^5cF~FSbZ7s6su>ZMSLMQw9b-a8B=$&B(la;uSzyCn9jRh26j4nh3U*NjvE1|< zg_6xW=7jeM#LC$-;7mhRzI}0kC`9f!TR?Q*;zby< z!v>4SfMXzYM(egVo#VHZF?cG6gY0>QNj+*D@=H!!IVo3+ENwWgFg{T{Wb4Qvd@b;J zeZ*({UZ*9QOEe_ZQaUvwkc}m_R62Syswn%+rF{*N4&byvdkFD34;a1N5mV z3o`65iJB=K&Ke_zz zw@2+_W@A8jaib0ylgdoQT`r-&z^1)~MvADtaz+AY>_Y7{(t%DqOh7O0t>ZG3@*-(c zgvbeu^;6*~LsRuFfHCSk7D)861b_Q8i&{4C>h=XJ1i{P!Y5#$V{!&)iwgtehY~L(n zgtKlYg_NSh1JQ#NzQgZ$=Qd)WO{L(KGGhpkx0ASIToPHzK6*hau{-^TAUqyPib(P4 z0$KGGUg9`pAMUNw^aAkp^@WWkMGkz$R$|x@4V`pHdidrZi&5gH?4#M}_2XJsG-w|r@1rx~=d}#`|Hmt%J>dhod zxB`;?oEZy7r3IU1dXjhOA-{1)jKl+tdQLt|kXU`_Yr3ZF;Z#Jq=$1La`ra4|T4&{q z=UEQ7pK~*cS;egY2&1myEA&cdNy@~IlSPYmvoDM%Gofmcgv+K0?mXrA3z;axl@geAT_q`?g5V(TsVt*gLG}%c=eh z{v@^kQ7==WfpKN9H;gDem7#*5H-+uQ8}m^TFmWUi2txSnKQXPt`zkCJLu@Epk}H7m 
z9MbdyXgaK6==PkohW|YYI0b+KjXv)v_alyvRsWd$fMEcVbRR1rL^f~WJsr6W{J8I& zYWX2I9FK9_`(;M)(>Jlohvsz6-1p?Vwa|Yvqjv5X14>{oC|gARV~Nx{6)^$+wM?() zweMX2U~lsiDYx-ns_sxyN|(qX`lUeVjDGZi9bDX>U^Z`e#{v4EFs(eiM-wEAN~HL0{J(dhbiaGZ}^*db(s?EJA+BuZkDZ=w$#zY0aGA~I;si1VWi)g82vr^G$v$M!XiBmC6{4 zsqiZUJLC#E<5V3Lx0#0>hd^7OaUP=|yr}`dJ?vvSJGpxP3L$YoBl=f65L1TIh+pIQ0nj_I8lf{?sd z+xz+ukVEbTBU}ZPgC=)9wB&w`PT|hKg|)sNNyJ1M^b;H);^Vv~{zZ_fjLyK{S|Vji zntwl^1YcW4zi}NFKdk~nTsPiidb2xG+e7RStVj}HMO6Miv{1&wc5(66;p8$Zj#1&S zL%>ss92^Gy?O7Bn>J$fP$Jz5jT>~SuclVwTg^S!k>|`^2>zk4N!4p4JHp1SED>3x% z1%e!t^)Q?p5fNM4u3Q#8mqA~xdFeQXa@Wah8V?~}h7`{j`C7gHq?yS#C+1Ea(aITO zopHF||AqVej)}cqBq&Azj#mm{Qz)em#rbLSU#51BWsM3PdNCTc1+q=Bq!Q8uT3;OO z|N8|I2_?yZRUavbTf+&A6|dM{4FFUj|8a&u{qp}^!h4+g*(_zt%jZ{&Ork0Kt6WuY zn*^PxmBCLBvFXM(;zQtdA_zw#)@_l28{%n*UvB(&J(DRCe4~&AAbqL+rWlx*DNm7d zUgTT-1tc=m6B~fHC%RWbT;@n}zqiP<<4Iu;fs`!LmQvD$oQ&=;ef~Bi0;jegjSQXF z8iKuv)fVRju-)fMk-n7_%=`hT+ZH<73_FHfsAbOpSjLwKvyTdZQIjJX$jbx{dEV95&hfJBzL9+wd$p~y9c)OM`MvTNir=@ z9LZ~(bX>j_!I|yB7jP>3i*G>vi&hAZ?J`PR+|z^L@QWSHffhwb?a%g-p2|%y5JSU3 zjIptUgJSdOzT+1JR(v8ah;xM#ag1Eq9l08cV_1q4h_#%X-d{dfOdNnW>EE-W6muV^ z@0BEh7Iqgm?r*33hb?>{AZOB!vL%6OrIjbNmG;F@!;W^M;ug zxgg2SPJ{Q8upo}|VHd<%1FK>xH%f-kTSech!_MJ^NY12pDFU||w0Th6sT1JJ?R_ZT zu4J={8lklLd#b>n*w+h$Ip;{N`p!r1B*^G?dhbt>4%RK8#>Uzr1luM<1XEn?w~{exe9x(Hn;S=V9g&non7oly;A_J=vo?k~d4#i0)@G88}qK6(ZE&zsvOq+q9v z<=V23w*T4ONfxA+8yze8iLZ~r=4_+w8Q(QFNWt6SN&fZ9t`+f$a~04GbANZGF#|iW z6HKDs;s>4cz7E{xou=-sYx8Q-hYv1XeCKeRq}nojZ51L!8l;yHrn_mx+YXTFrwM<`Aw9ga5h_vH&4r{Bv zy?F!dhh11;alMq^C;4MT^7uPC;G@F4jJqpY&t$F}!ZbR?r8>Ct)P(Up$Lmki);5eU z79BxMn5wsdktSF!R(V)*%F&gmLxI@WF_IPB_@2{u04OSLg!h&3NO4{NLgAhAnZ!#C z@Wa#ZKf~!9Y~%4uFm2Z=HEgf7+}p#X-ksl(xg^gB5Ua42qHCRK2&y5?eS$9*&IHE% zB?*2&3o1X?k1ld1U{HsyR&dUR&9Xv$$tc;(s1!*O)BaKAkDtFFh)CgKKq)#z2pfN3 zyqVN7c6^4y^CmmE(FoqBfd>amSFiFN6!d}-l~9_+@lg}QH3gJJ?n>e>NOIl1Yu6vrC6uJt zBvc7;=U_DUR&`U>mw$>*~Mg#qxzd zRooCUGLPQ{5Q~qx7sM;8GFZ7$ht&zE*_r*tD~@YypwYEkI_%eT<~S4m`_&a^8baTT 
z^<1)Ucsl4QR6n%cGuJ%SA3!4xwgiCb<;~5i|C*VrUkwba;4?yLisNV{d5>+fv7$(f zA`%dW_i~h$Fq~IHQ_P}mTDf6~!Z}+mI@s@_qQyxa4bOiLoWt|)pM!FuXbNId0lMZM zxFl4BSI++cKu!jV9tdoPwH ztk~YrIYzu+7^HNhVisDwbgS(QUil8H_;#{o-{L4y97_r?)FN5-vP}$uaKMU9u1R!y${E52Gp0CEv7&mEs*Q zBSBN3O~W5#H{&m$48?&RFV)Xh9<6qDB)eSZIVt1nCF+4OuvsGO9O_F+iszacpht%6 z#<5obRv!pofick&^NMlq;yB%<>7kQRG?{IZFRT#yYBTY3aN+JMbh@f17Rw>qXzx{lYtrzoRJ7h=& zf93k}sOj3qvees}+nbSH5-%Sbj$kpI#Qimnvt@F$6n);hK_N2wrk7X-THK&{Z_d|6 zCIAyvOi+`$_nIunGXxtTI9f){y2n}?^fs99G+Tpz1IebrL5K-Ihv_csi1-Bn**#3x zvSa{^QHmgBbhOn~_oSB}Css z^KpRCwt&L~qTi8!v8HlhgnkFF! zkf##|$AmIzLGEGKg8IUtwM^@6HlTm58SpAyJfoWu#G1Td!`9dERR zyO;LUMQ+QRS3XDE!N9p?6CDu@6G-UTLxId){9PWhH(7raRq!H1(Lv{Bz|W?9T0PhGX%%_&Kc4^w!3 zQP7Dup}lCUDfV;W6V^HH&30mHWz+(iJm|^67FMwU2l@#XCk*g{k*nD+MY?OT5-E>{E*T2#YQ=yR2 z95ofk+A-U)Zau9xYtG+z^r}H%pt{C}_o2RNkcLLKeUKEs*VTm_0^27w>ZMCm(&8Ew znF}qc(ZHd~BW^TedAW7vVV1Huu_J=excc5r!4{JWCyyX!{0{u8qr`x@?Hh$zmPUPm z4IW|!r(AbI#f5+emt4Gihg&$|$pK+6zefF&pSa@Tj#epB1)3XhN=TS1ihLmNN+-@) z{b=JHago#kZ_eRV@-carK`_o_*S$D57v>LO8IHSvwQsIO)o&N;^+asFKaAie7yUrv zGh!h7W!>7dTTF+Ut~%%Vxa!VrQGwhN+MtM0#(pmrNB!Ax{+E|%II}c`B~+mo`+0+* z*HQ1+R~4!sEa}V+&DaTMuH-kPO`i?EJ4#!X+V0vjlyKWNBkb1gC6igId~3UGmZwb5 zvXSyJVeET1@yM0`@DKV(qpiU%tMMcEUXqhS@jiMuP;W70?bj!-UgG3rR*m-R?AEX^ z+wex2;wxV~CXvj9Y4e57#RCrXNVE);myFb=_iy-^?pL-#!=-oBhF}>uyKVM$Mu+JF z_tBBad?~$Yv|6}*k`2J{j2-{4ooU)a7*m|Sc;nyALsz_M{pjMXi?k`(I118&q4S=b%&zw}_ za_(}PSts3QKN*Gq5qGz-d;G<};Zi7>(I!29QWe{9#1fusFTK~p`;mm?;EkTg7OOG% z1M#2RI@a4Ii!wDwn3HVnR~Zrb6qGVP$mY`3DlR6ClystrVX?of=J1JJL3{_v1BJV% zhWWaP$ys2NVVhCgs+NiaSWNfAUZ(P=Cg7C+!pyvD3l2|$|A>#z_zSc*_!$XM_okiX z>L`xy@zn_-OAVZYcl5E>$Ez(EgnDDgWp7)4mhGeq2LGxj8~Nx5B26J2yT%cDRcS$pUD07nH%4+4?*836AYfP2b zwdQw5a&`{OaI3k8+$Xo-Z(cIL zn7}aj(3P$QQU>*`_A%K47T-W<a@%}BrPpfV$VszK+ z85c`@YSY?J2;b&fB#hISK|q|*lkm7$%eeH1%2VjTA`D2J_X^8CUWFI~G5pZQzBV@M zAed%-K8h)Wirl3q=h&04ZFt`(b>H|CqNDJ4hF|YVEW4Y!VpUCNKVz+gfWVPx3uMuI zSNm=)pqwR3ZyLt$PT3M4D6UW=w!F&toL~LoZ{-gRCyf8qu`ll$vp2oWD?w@$kOEos 
zxQ;zKq=B24Vlpb=a|!d4)7ZZkE?TAhS{U-B45_W3qh{2_(2d3Q^kC> z{o(g}5jEN7a=*|Mjoto_%cR!YF&=3YcafCd^)qf%@%z<(qfEu|rjjInIvZt~8)f-p z?tOh7HL|q1#IHn{miwZ0@xA?aQ*3;k^X5z`UP3z+&(iIb$Pb{O*LO{8UyyRM1bpLP zBv;}HT##f-a{t~#eX_h|!Ib)M&&dpNF{0AcUc&V^(zLp@$S2Up=J^>v^fOsxES78q z)!)p8;d;{#S*G<$$2IIpyt6a9H+hWv3SGr#?w}>YAJEu}F>8vx~dW6{auy{p4E>bN;+-H?5kRyKK*tK61BAOzJCqUAT8jOgazv>n4D&~evCt4Xspfj{Q2mgkIQV>f zV!EVmzJF)%y{V_1d}tK-GHUuz?=su=rscc5VSIYSD-Tw*(Efp*kBhd})B*sWWOe9n z9=121{re$K+>Kdo%UkwE5&vfv0-JED7&WGVI4*9P_YG&Gj@~Ful~3+W#yGz!xxZvZ z*h~2N6-9IN(TJJFxI)i|*Sp zMa&h2wbK2#-`#kPM!*}xSGf!vb9bZ!n2(JQ*9EEVq#}ixNnCyPlU&1!{Bzz)O36js z%yKG%b6(yWZ=P?~E)CWAY`X1$G`t&i1EX+G{f-n}4@1f-}S%2vCiX#}Z zZ^tr?($h+>_7Vx}urww2|G*`{Q8xg#WT5_N=B+?bjdK&lWD(JQc;2_aXL0(ejf;u4 z{iBmHQW7fPxGo-Nox?nvO)Xun-nBU`0Y44lKV5c#@C;JJ`yO<*^Vrk zrpxSdkB_J}c?SQ-<8M$nz0fIOVl17SaCDI$ z*eF%;?xKkO6a->{tBAX={#^FaG^FC4Zyu7Y(=pJFP8aO63}Fq<+#VUv(0o5)r8Azp zT4;>?G0`o|QgLi>r};KQCr>Y2Td*S zgQvRw#?{B<{mzQRb_$z7d3a3^80Un29bR~5!E@X8-v(FLi-ysSY}IDB^x%9WSOy!} zkjGq?9q&0^FUM9*4vAckK5sswV&ofZeOnHRgGH;2uK(yWV^$9|E%p6Xe)-H^xeM(a zb({OAp~7S;yZ1AF^UD?@t_^Dq)l6%`ZB5@EpM&@fr42r>2=>$Xd<&f33Kq5ZmUYPv zxE$2>+omgwG|BU>cF@?N1T|`Z z6IFEham(>fr6Qzeru{{bPL92Img4Q`KF7~C+mk*Ub#?Wu$(kRN*O4g%pvM%$ z$FVeO>7_RzHpc$)zKwNjOhc8NqzWH!C7175De&C|&zKm{^QZAnrtDuQC%Sl5{AHbH zMJ&W*^=JJ7^{P6bjdy%#MTOnx{Kwjf*>br5(dA}dB)q8Aeb8v}67Xxp6{B*o^t4>o zuqfCszHz~sbbc?|Y4f6L;m8^g*PWUIvYwha9sX%R-e&3VzITeRVYcwvWaG%sH!pZo zG1V(fe+A=fLsT~&3s;vPO(^75TBm!IBb{kP3QMFms5179bKLCACtYe z&v9D}zyGeu;@K)JIvx~b6`RhHC7t@c{N7;wjMY9y(Em3HV}K6ZRPTmRuYUnv5^>zy zga1xQq94)wLZdFFY_LP9P30t&Pdl?KS731=M-G8rU*bkXKbzjh+>@HBFHPg`?DKUm z2K8N7oa0}fTY8+AB&Plf4%FX^kGr!f4YgLVSb5N2V@Ocm_p-1wJYG&+ zT>U!Loy+c09kLMF(DL!6FtI0+?Z>%-*g=fe+M*?I*{mB^=4^2B_$5!4seF*o?qX!O zQ1iBQ$4G~ng1&IF{=4e=3*FY8>PH1<#NbxlGQk%W=4lWLl6Q=zUdV_?4!pe9Z%VY15cJAu^kA4c~ z-W{wS{i`~3u8fY@vFS{8LdKhc{JxN{16e+k#NFP^K8tSI$Z)U}ace-41UCM3FKU{THJUyGHBHK5$gCyac$Eir7516n(qt|G<(Jz#v={ zSGp~p-_?N2Gq74AnfEXc$l*HXzaF|+^Zzige+l5HX~?s8J9^eQVQ!Y|D+Ml=-}-1u 
zA2gt@PiJ+eW=XKhHW~}x(lwN(6WKP_NjxoY?te$#I^V5gpOZHwVz_Ux+|5(e9lw_6 z@L@af+hSn;!MkyOzqn+(1)PpV_nd|%#YwdSuMUsT7QxZR>;enimbD)XI?vE^yn#GL z2MhOsu9gSiL4Ss2N-VtV#hYm0pz26mhT~4|=)8qCH9@aFB^-me0Db00v$}MAE#6NB zgv)@!s^oR&2_LnWO|iN&M27|5X%{w0AFTb}z1TSK{R0>?e9JTU)o#={=nh$<_hj(= z)8>V>UDT_q~p1X{K+j~ zi(nW)e`<63ZCBIww!f<08$R6=pXblt6pbvp3G4Pc>G!N$`~T`PEM)p_r)Ij@8(RVLUWT*9$S6B z^&)%j4XuJ~eG6Q@O-`jZP0WsxGHmo+i;RqD^uN0N_y7J3@mcs^cET>>x(bhv@~Tk& z{`h_*-^h+Nr3OXDK%RxX>n1GK{5e09B&QlA(WKQ<>=E%-*S(bv7Z*hV^PfkpDCGwm zo{F4IU|T(zw(1qDOfSA^ch{d0p(yyKl~s7YK2$<=?2zXqGU7mwk}8?HDow2SF|Bvm z3RxmM?B|U?2rj;P4~{9!0hVNsx)_W~vF@OaHs+-6x26JX8(6!R0yk53aWr-0T1crb5!5{GZCMTN3_lnXHl@ z1g~7zHqEh=mDAiaZ@V3RJ<__s+2!Vw^8Qpx$vB&6enyRHG8?M)Rf5<1RJX{lPv5O{ z>N~C_Z2xXj$V++pPG%@nNi@>lU&CkpJkREw&V0O{TxfHK0Fy!xBQcQdIp&#l^>psC zm@>*8K_Ke((?|nY|HrKTje8LB;LQA8mlzE5rLb|JJzbe*&A$Z2e(zpigXnm_A4X?Z z<%3Ggld^W(f@13Ao)(FpF8bq)0GQdb>3^9wm`T+hcR+i}JF?PlMY~_n{mFp=$(2V1 zaxHpFIG0^gdU#PP_tmhXwkJ{WT+gwozSH~h6dlJkXc;Yfu!C)YB$#Kf5D~O}81hJc zA!Np@IkZW4mEQMVa#m`x=CK;Vg#VKB=v86(+5X`;k+r!4=VoN;0{SHf%phHV-0>dU z*Ijh-t+At6ntMS$@~fwaytwqA8!2c}B4(yP!y~xOXF~|f^bh@7vB+$l2kTCuh{D9T zf15#g5ZP7LveO68^fHm{&Xv{|`x2e(q+-mdeDj=v^G>f{S_~}d?4vo|PY!nI~pQ{rfeT zbXe5mX08_Q08s(WmkUoiTlF@Hk9oM=K>`Y{NKWY$W8OSZDLIzBdd_e>YMd4Z#J+T23Vaf2GI$$Fy7|Q+@3HP?;Vh6ew2W z2rDmSjeWytPUsVP@fTDtC)RPP-*~?eNGzCI+~gTB?Fzw&gq{Vq=^C|;Z3uG$aV3d_-cR$q=#r?uPaETQ=p9etqNv zz|H1-*Ohu_5C+r@7*^IHeAyDcHpqNZY5RF}+XCxqowuVRpm35-A;o7LD|BB#9v>aa zQ{o&`aHg51|EFoh#^A+q1d#m?FHmGO$AzuYkh$|;mP!oTG9I21>t+7qHMy%2VO$(u zopAJeZD8;1zh7C6Baaj~5ZE`0%^XE?a&Yyf2`oiXx9?xA5K)9LR9fbe8na5>jEic)u$ zx{!{tAvRtzmm=J`Bj21`Kjt%1!58~aNY6MoNa9a8-Vuf=DpsLex=;9?bH0s=GrH$6 zOm3k2UtcXoN3J*4(^l73nV-tvJoPy)Ch8JyBbWQv*9N;V(by`H5-=m??YkSB^xrX= zn?h?}b#rQKXOOgpiop^;g^=~~dR7q;G$K9;&Z>AEt0IBE2x=BpZnVg6Ai54%8N*&O z%UwDU-%;0jsbvnsFe1h?M(b12_glBk@XCG7w(+tus8c@j6+4PgAv=8xco77=RVNRR z&Bh6`^jtERo|_e4uC-oA%hvgZ3&+mKvh_N<*!3UaDn^6AC3g8^Uwo1(pL2^H_Erp? 
zfJSEQvAa4HtpMnEb}Pn9r*RIvsAq6rUBb`B-(E;Coux&6sXmE$5EYq!x@nVcYobjY z)Qmz39${oc-{Q?O0^p2Fe6YhMZd6G5bLj)q((=2{fD@q!LI#%8 zeTbvVQhJI3h#xMY0=Q5{6{A)@2%rBx@7IMQdiPctI4iPC|3kUkmt40oaev42(}vy) z*IvR4C8SGjOMqb_l&rzAZhK|>Ccm;U9q@PSef@Q!*6^|uN9hrK3(_lW@-CTnDCl;; zt^CAy;P;Mv_pit{gkA6VhTt*fxU#NWg~xXHrses_OmX4^hGTyPQFDGkK)L)>>H- z-E}%Kii`7zwGw!bZJH*}Nz4o9ft~!De+{WoEYCDVlbmz8qznS12vkN@Mj$2EJDJQY zyT!e$%)2LWf~W*w^bWLShS}V1N*eT~bg)_RM>v?1&W(9Dr~lH}Cc8-NP07i++#SyO zhy>j2Ko*)O-d{r&4&l>h=(@fi9za>I7y;UCIJ!?Jf8%WUeaFd~am z-gL6f`FQLl*ljz&ZcDyADH1+i0|>SD!+V+r(;_jyrrk47CQ_~E>h1PYI%h86GP*Nq zC5RpWzob0F?Pfc)cjA;G>9JlZQ6l2Y-b+s4`%tbk#wypL69MVPe!I!#`mmjcQnI$K zzn}_-rwg?W|H=9C_5%~OqdpZM?6k-2;HepOjFjoz6C8c*$^lOL{GH7c#<%!<10=uQ z8vvaa>TLadR-6%=S8`0c18k$khzWp%4bLq;zaVvjI4a?t?T}*hUmZrPzmTycJO;8@e$Tufiu+9 zo-4oQTw~ZVqoegwlY#kt{4A<5GEbUubL=>$aL{_Ldy-FUXXmU7EFLQ)OPRm;U#aCD z*J1gaGdqW!8x;1hDp_1pk7EzPOB!YLhl3~f;d&kcD%CHysHp2jeC}RBrW(Uhnr>Ih z5_JDm^`F6U2Tl7wc(3HA0$#nNo*sRgWT&5U@q~lq`oXJgXdea8d*DqUXVfme5=m$|cx2ZYvVx2}E_H=tUexSGKwO zY_~A-O&oq`v)Opk5S$Z?S3GeS;ltNCG8-L*-1%gt)WP127G+Dv7l&=HW9vl6Nj;L+l5NP8>;{U>v zBYiH}@b6C$--Lsp`TXjMz!>aBaT2X$1pzo^8z_FiqYSic?_c0oY?;1Xy`$8Zj`YU3 zs+Nq$d^pkDZ?3#;_Nkt&Q6O-{nva0U@ z*Q2M|jb~xhDR29>*YG5pZPT0(7=XDY+>+;3z+?+^c2gkH<#3sDtFgpj^``oDUr?=O z4l%(J%#INkC{Kod#Qo*>uh#POAGs`LIslz_PUPrZ)QS@`l9n5mIkTEgfewUjBZ|d+ z%}VaXT$C7_uJ2S4Nm3Dm4leH+E6Z{rgDT=jDp2Xd$VK^D6jbJ(eaWqcpvg0Eb>1nD z@D}{>F+JMhq_wa@QG=H1-{y~eMD*;;VN&$$DDza}tdFIdV8Y;Jdy}IJ z>}v-)k_>mNi97|&JQ~RgDbJL7@@I_(Bf*&%l^7VN;Zrf?Z@IsrY{Yc_oS%xMACzEs zq_Tb^!!6bRZJOAP1L2OTBH{1ZvKr)3l*|9sI1q!A^np7c?IoPNu8e%>^*H{PYsP+t zT!yU%s?9@`m*?HGroO;hq3fn64gla+kuM!y>vQ(vb%-htz7_4!G7yFbQbvnRW>t%N zfoS$>Y_YfKwl>XmO)Uozjf{M{o6ZJI@A(KHHtSKzj@7zk&2rDbQ)`lUC_Tg%Tqi@ z^htte);9PkV!w`&&P98g^DfqO<-%4)IOYp!``A8@17MHt^IwG1U(Sny&~Rr&Kss0y z7S(x`zj>E2RX+BlS%Q{oY%5Jd0od@jIBQa#9rhRJ+%4aA=a!b=1H**v&2{hv?7PGu zZ3X>_m?TC|yaxG7Z!;eER5+%HVvm$>NLIF;N0g-WN- z%X))>1ypGXOo`@QEFwQHnUyaw_LGn~q}&Ho8f zi?LFe2;<^nO+l2UC0SK{6xU$14v^qf#bo8>qzQgwYV6r-A<^`X?SJRVyf; 
z-t_(ZXfEb1_Zi7dhxw=eTYayLlJ@t>>yPhevql=EmnGPBFmWtV3gyI~T$+32%|13c z2r|WU3t_CEo`(g2NcifQMZK} z7aFI{X3Jp_9NK>t!8P7{AeHVebLvk1LflPZ*CrHO;@_JOTLK89kJzy|)|LTdq@hE= z>;RT&5_}C7n3ehS8U6oPgsJNe@A_bcm?p4rjd!?ahhU|9`rdqsro?Y4`&?1F6|@@~ zngYNP7t~d(EZj{YF|Zfb9TY&ZEzDmpSZ8cz;}ltDd(Q9zC5mT%C%E(72~V_49ijhy z0~%@(SVD0=rQ9r=H{QI|A_#8)ztd|knLDnt8krHAQ5s-GeRkwmco;69d=wED47^u4 z&yI(zciUN{TK~Kn-XTQ$E$J5fZ#<2|A`RO{ZC}JN%QnlWKKseFIg5m<>}sl( zS-)fC1=@o*(OL|(&`;T?T)!5~0$`;3;~|zjxT8CA&6H2^+sb(#qAqJ6ub*m?Z}kXU zEW9!#>X7@D5RRacKwO%3_m@r#DzUgPO`>L3ATsYgGka=Uu z;zUn)u1&uW$Ir?JMh>u2XqPF$m?UmKOx|`W>J%+G`)2oXL&(!!(wUIa9ZE)MUfy)Z zPu!17j(1~X)raDyq>DhD;aFejvsVq@xN<2-w`OngFye}`)!*Y7t5s8c49D85JfcBG zgo)7XZidpi^<^FJv5Fz(z_ra2>s3#L$|K|{1DN&}H0&%BH0$4r+X+cfiFKbddJ>~1 zBh*6h7>c4pLi;p7<(D*!t(MZB*Qt{SEPsL4hm3pU(;bqfoOvnt!6ZaBI1kcZ46(^kl$fuJNO#3L+u#y-W zAM6ZRK295r8uV7XDB_Bf^fc(L?3nZScQGPmG%>UjInri=0czoIInG~vPtwAybV+aC z2afKSB?vw4RSC{>|D88R_?z7_d$5`cDj-`iU$$l!v`S|i+xji`ONx3AtRryJFJ_h; z2;F%CH0lOYY0B=O}JP3h>dP!E|K`~S~^4K;W>P6X(8Bq5X_E)1)2%d<>uEspo zbaQO!iuqh=#mf((Ix`#A@=C}5^pga?MZfo=RNI+;PS<&jAZ#uX6E-SZa0?7Li?fc{ z4kCOG7eivVfZh9`V^|f*0Ao_vp|$+>2^2(P>c6c{*3}TSH6(tO1B0;@XJ2Dj3_UTL z8tkjoY&izB#iNXNUpwCcn1>UJ=Iw`kUfEZY%i+$GFe4sd+(>F4PfrK%O(O*^z=HIz z`nm)J7he{42f_*M@6OuaqQbnk?5ESqun$~Q%K@0D5=x8!f*mzY4N)H(@}q7QXK@c; zl<*P@W^&J~2J%<3)g@RklV{hoRWsfQTH8q<2hZNv3th{B+sf3a8Ylr$Hx)N&@#}(_ z;?^`LdjFk7+yjgTS>83X+p;hBt*zE^LM$hvA6_;pgkV57Rl`u$MmDA!ItJ1%KiP&4 zFCjJ1eg?`?^O$!V28XgWL((w`oMG&v-#KKj>p;2T>(){Xc8XQD}!zM(B?Z>l-tjLCX} zqD%KS+|je5^Rn49n@Kll%sUd_d}j;Mu+zxu>c0!Z7tMYDz=Pey|LR=jU&$URY5=IC zqA%K0DS7NvGyZ7N+c{qS2L@WNf*PR7T}O!mlyQJ%`qEq_T!Rt%UhLc& zMZA)UCmYv_FYp*UdytteYG1cu3(~bHnX1UK#jDa4 zqdl`I%j|zbB9c45B72}!z25|Do4?}5FnExV40YYc-67Z?8EMt~Uh3)%IgCMhL1N9m zJUdqe5tI&)PYpIF$Lt1LeakWb1JFP(zgt&gsPjz2*YIcf7G~Uz#+sF|k^1O3D>4Oid>T+%Mo9=Ba0l`&$Y1VT&NHvE%L z`k!&TXRT#D)*zUIvLfpq|AUf0H{iG%_>s8DGpW4MWexZ1yq|+JPWw;l{zEHdOE%cB z3ka+$G1N^zH{rPMDRs*RA~8kcdN^No$S4{?ODJRqE7d9kKGSc)zxzZ4J{$$a`2 
z&dT#`?bz*-Xdrm1sC)F;ONG~J@ENR{s$1s2Sw@3gmmOtxD?15`@x zCJQbtwiJx$SnkjFdcR+HZ89{Fq?I-`l$PE^OV_`X^zT*ufmQm(+5gkl+~zaEYk@c&1+3KMQpyPdr4C_4C*%eJ(Bz`=h~ z4o=&R>@(F9%O@=>sMHSH^D6vV(!0-bUgEATa`s5Y=ad8e{{Vl{-u0lU8g5r!&?LR` z>}qP8CAwBPw*X_Q|>vJ-S(SGkmNNlaRHVM{*$sk z&x-zST#?!FZ8vdiOB1iFL3#)N{aU_9tgvI%ZRkk_QPTzWI`n@*OYhIQ&C^L1|B3AQ zUT8{wR0I5heuDfQllF|9x@fvV4ml@(%E9`$20y?RZ8Ep|!?x5wE;}=JQ}Vk*KUd%{ z@PpuYk|W->gA$o?V@h-cgODDWf(vk7n_R6!z^* zi@7G@Bux6H=V#y|9G8=o*Wf6tL^WZMSE4kKYDWN1%I%3|_*koW-3|NBYvJ)(#R-XE zOs$PWE^J_P z+>#MGs&2^ffZ$;*&F&YTBDeU!CI$tAbO6qFEp0E!y|%KCvYSs!c8^3siPlqoY525q zSEKTGqU>6%)bZeuTFFPk`{AEwWVq&C^=Bjd@d)~q=$kh9hqMHHT5h|!KbGidikV6< z0@J#w4*gFmLN?-tCAOX1%Zn3BFs?-Bn1}zQ*6=3T9R;zF-xTwmQINLJ=;ydX)O|XC z-c|DA4wBQ-_$gZ&9_Z(ye8(P1@5otisbolx7FQ?@gBz>!pM)_S$5K~7R&ScM|Tz5JPm3vf=GxC`#ijei~X5kPk8c1d++{lO?2ff;4}N^Z1%uj1|nXOIN=GYITge!^^b zuX`h>&4WA&409#~rSe)}TXc&x%y}*7Pf31hQA@(ZZFK@GQ!wN9zMBPYG4!T00(uk@ z9SZCfT3aXOd*$7>m7-jprgP%@r0~X}l+)7e5wU(XE4@82r$T;Ez~v9Pl@5Z{Pf>(+7}9B$}%Q+DjQq-{q4%}D$B z$YtdIzp0@R4j5BGLWhCJcf`H@A9DqQJRE>&F>E1q0(rnK3~4G+{;DjQNfq&~7e`m3% zrou_R`hE%34Npn*^*B~-yNOYoY9&Z@-U$<6y>LvGcSFvDO&r^pNtuUX$?!=i6%-#I zmgC)YgWF2kg+*=2&?%|PjsiF)`c2(-kkr}nW`6f5oHT6o7h1cgly@viV6}yUhCIl& zY}jELT)CiAj=8X}`93DXd>kXEWau{i_p0u+FDvRB(B57Y;s`6s?rE+Tl^`v;tVPpZ zdfN@o&dCn$RN3dL47eXloapz`-{>g_#sNcM-YtDjo#B+r&X{jH*_^7@K7d>)xTH^-8jlP zJ6N|~)Krbs4Oz{L^{k7f>DJKotJfl*r#j%ISw&|iYx^4VQ;}f5E#2->j=tcs=8;pq zi0l$Yp$1f@mVe@F=Up` z8~kK<)Px2@49KSfW0XZCtSUXWQ{&-dT0M%I5dL zQr>N#w+M^rZ7;f8PAU({(T$;1kUSW624p00khsWKBn>xcla%so^}39aS>2ITU)_a} zBE=zwWo?oZG4?3G)t1CJ8Qyjg5cFS*k~EEyn<^7KqZ3C$7Q|>uWCRox3e@fXi%#s} zL}<%VL0gJF6r^;1%rF-@AsItVy6r{V8S1W2EkTxU-cgJZkj33BdTcqtPNF7pylyCN z3uv53w-{zDHcIS2ciCGFM?i_o0hl{fP?2VeTknyR&`lk==CbFqka|ba##!RD+-KYU zMAJ!MyN^z2D%EY|)iG<>HY27X0Uo(QepqzKl8s*J7}~uB)^vX-AfjjT%VN%!AZ%xuE_H!DvkBVh_iB>6kx*w8Z<5FdB|3h!u}HKdTy92);mioz$2GPE zm6tVEz-4*3MEaoRKyD#HvE`NI`xD1B zIBU47O472?&!!uPWo%&4^MXjySk>6RVbZ6ORX{D5cgkl?m#S*`Du&4o+n}9Q#uiSg 
zOFUX~xMBlp$|{Lkhd$@WCc0W-GZLs)lPXc}aTO-$pqvVt9F4>cu%I{a(IT+`o9Wl0y^&zW=A76^6Rn4V8C z>M)-(%^f-Qi8SykENVgL_R{1P7fFh<9*qfuh8Si(lu=eRigW?$iU;;Mo$AvvZB-x# zg53&-+YmL^yG$#gv99|J|8h%vyO+a%+R_s1vT$gL@3xCOqu;G+>GKj)Y9h#{%iK3a zdWlO9GzCRj0=tan41LfAm?PGfW!-jWVbgKKDr^J>*mO&%2b>rT7GysAa$6e!7GVV{ z6%AtlsNVfpQa$H1p*f{hSA)A5KwXDLSamYoDB1UM7=qTdqCeQ@SV?ChS%84y8$Gz0t#@Bx-! z$b|r6SA%Ng9BH!1T%aGbW^7iOF;A~fYr(6!y4pY3&fO*~!#sR~br%InoyKknsZBB8 z2Q5kO(9b8Bhh+WnOC%!s^%1c9RX*U*gvj&{$1ALLkU_#mZVvqwd^vzmhe{z z7z!N`voP)qKqa2z?Q3X>dlq08=KRv|kJ>}LrCPb7i*4e(qZGDaRzq1Db@-?WDXUpC zt0hd<9b$YVN2il=9Tp@2wKaY-+LoRN zcs|0a+r4Mu2(#()0`?ue@JqKBHRt8CfVTi{&e^o4{Y+!8BZc-E-f8K)@oiy;Ltq+z zPD{_5;rp`=olzi zx2c6>AEXAovDe6&buHM?T$=@GG(~)$ovey=fBkMCN=rBW2A+2ZHMpuJ{2{VaMx;~GM4*RmqD0Hgv|cSK=*{X;Zwdt6^IB_5Uk8C- z7u@|SC$Va1Wc66&XhGk9Y6Kk2+yONpMlK%Ta#6ux5PxzLF+Nxh1J$xvF5QYz4f|C& zo3rp6{A%zw)gUmSY+ON^Ml)r?uaw~}-09j{Srx$js)REFj+{f$)b6%t@SE1&b_|7< z>PttmJ%lC<>TUWds(ihq@5fp5w<+86 z72MI*zIC$*^D0all+~BZs{XdLUy$B+;epY=r3-r1!kM@!%`0n1uD{F|-)wc4;@ zv7l+3wv*dy5zey{^yK;Ly%`clVS?)@*~%WQPb|v zJDz{*`cz3MNSxhh27RESk6PUN*wcUFQt?Gs{?K+Jur2rP3H$|?jCRvl$0#cxopCMHuv83^T^uHL+xsTX*mld z*lv%U^s98ipSbk8FXX5xZo4Y!AK>3cU@%rf(AG+(Q)kTw&o@Y)ze;*+!(Wrh)u)oo zRJ2u5kg_W2^E3R?HOs})5JmU#mRsY{)KKl$a2Ngt56l8@DQ;Ml%V5b;i$2%2bovhb z*rk2i@;PnZgfBj-i3Y>>U&1fKf7QbusifDumNXm8t;kpU8U7PIu)XC?XG4@+3^>*8 ztSiPh2fxFQ@O#(Nbt_rVyXbw>`GE~LPcGp9Wf%REh2Jc&n(((N>2nu;+?9TO|5H1g zxHHouXy1n>{XG2_T<4<~fNhD+IceW+`ZvSZ`X_tjc(lboK?H>R`w^aa1r*Mo6aD9% z(GmX7rUsfmfM2`t-*#!zl6tYMRL*LILZs&@`~*M2%U$8mO8|FeGLY^25`No>|BCA~ zvtnLcfnZaP(j&>g*)I6oPH!g8^s>m;mbCBB@H0I3^iz=zc;c zl9p*?OBXF)wXTV*zwNrWeA6XiEb`NIOPfE!E4Ztr-*37mq1D{SEnihtczR9K5TD3) ze7tj-q;FcbWUXcUH^lvSBmn0ed3CZMHD~qsn8Bt*?;UC1*)I5fw^GaDzoKN-iq;n2 z5%sQg;oo{S$;LpN@_awSbNChh0q%AURNZorTOPV4%KNOC*8|bdn=bmPWw+d$+^x&> zSk&J4U$yi*ga5XIk1gk|SKZ$75V?7*z1^No&I;?cTtB_#6DvWxaU#&D^bv zkSaY(2@Tb0ihm^(;id^z-*U?@Ce@SI?MJG+=ES+g9~sRhGj}@DN*^xMO$~l z{|n*&oyPf&xV`ru;2AuCC%cZ*o)g_J!JP0PLs==;_gYEtepmRH;GvV%Q*P&SSqk_K 
zJcU=g;(tr51ge%C7Lg)=$i9tU<3FiY@8D-{G+Zzq;@jIZJ<1%S=pxzZC!Oj{nb+ z*q`B?hF$cyZBh$LeZGcIJKx`gXR`F)Yansn<-Oj)Q=#zV&iCId!ngo)8p)SP0j|L; zJb?S~YS&r*OOiiJ8dFjVH9-Gs83CVFY_N;;wI>JT8kI7nkbF}bcuvJ1dp=_@sjQkgS>6Q|Pn7q0Khfmk2{+xwN~Jz1(YNO_499f-eidzP z2zm3O^F5zFjfEIjC6^hY%Oa!Aecn%m|7i_&OGJYoVNS(bdp-j&sck9)DoWav1^E6z z_)8A|!*1#KiZ+6M+H0Fdcv7|bk6Ev#q)I4On(MNix}Kpd{E1lVlQt?X?_~X*@J}iq zQWEj5h=e}xPd};J>POY?o|jbk2p{3|{`WVaQ90x46>nJFPh*?MT(4|Z zTJeJopxf(-v=MGf!!BYephaPEKl&enZg~9{qXIj?BfS7+>_z$%w2g4sk*r*EkS5zmwc;Hellb@5Cb&yly zKJf4S{`emh{Y(k|rcKAQyjH_DR5B2^jWP^bXc_AD7W6$o;UQS&g zWpjVggB`2UsNU|FGsX25<1jl?X8xLstAP#lYoambuX6>Lwp8*kFMZL z;7_s%#oD$xX|o$&zqgjS`1~-?GU?m%Z6^jTQ$# zC6|ndWy7r>e(!xM5-4T)ck?hL5bN+gnkQ$McUbs~V&S})w|u0v#sznUspH>6!5_Iv zD@nFf_OSG8$jfKiAi6(e&F{6tvsM?T6A7rpiuSJe%|)smMzrmGInm2s0>9OVG>^Km z+}=;6AP-Ag_E=jNUyQ$;d;iGaHxS5nP#*33);Rrz_wTD@zT|!1A#voxr>@2EXv7yD zYaRT;`-|Mbisl!9kaOSvlKW3wjqR7bAG=bGBlEA}kADpP_;$%JIlqVh$RuPL?)n$Y zpM&H_bO2gC3Gev}-A}LT-TKsbu&F|e^~0Q`pnAz31?Ym8wF+y8xrs!30R|MNv)0bF z!z}$SX?W$JM&qxmHFEthWLVP2y@j`?pE%gbNc8=Fg;*NOv#%cJ{a!Z<%gP)#)sT5G zC*(9ZYrt9Po4&jBFdz)N@nx3k+c=Do)~kBGewU*O%}{HHr=NbAN0DOxtE$33;E7Z+ z*PmU_6N`O_!j>v10HhN?Xd#lJ&td|adu>0#*~Cp?QP7u?eMyWx+zK79;M zYU%KxM(C|8%l+2(ckKF%!wF>wdo?|}qW+NZ{(fQFAqAKxwfVtZ%vJrw$Gy;yYLX zE0q9Sfp_p~pN-Kdz&i~mwT>aAS*ohic`5k!d`gPZPHTW{A`Al9cfT0#xR8I`ZJx10v3Kx(*9^Ibe^ToYCRC(gg)(pUUz(nC0peqtJ|1hmOURJV z?L+^u=teS^UocaqnpxC z47n9`cKIRAwI~1f!X%uAvu+@FsM49&()N#giE&Rq6K-XJRVluPcl*oFVVHr_Fm3Z- zRb=r&^z&&S`Prjs;xo#T=2d49as;no-qVkbV(P4oyardawXY-#tm?|#9>R-Vc`$-$ zxCA$~HLoBg+=LZZrSnuJDc#QmT-L_+Nmpf4mkgNI-t5;qP0=Epge!0v&M42(3y6ST zs>GaodN&8dsy2DkKS>A*DzDT(1_ReeX@;Oy0p<=}#fS+*E(q1Go#1cUCzW{Ab{T)&W?aF_Hq9gJ&u< z*aiQm;f9u;ACm`JQ-*jJ9>D{6v1^_>g%fZIE^Bj+HT?xDn1{!3M@zeR!T*Rh51mq; zGVd0Cl;IsbfCsx4YL(y&T!*V#2Gr|vG|TXEfBeTVrKRgvv^>RH47;kFcB!%f#^4-W zg^RksUYBEAgctA|+=bUW;V-~xZ5%)0c5ehHas^((6g*Im@-Fmq60X8^xCoO>stl1dBfDBbe?{>l;=;w;ypKt@{xAcA>zQ?;(*~D5~ 
zbx~DP_I^bzR&P~k^ln!fJ_x7bqOdrv7=KkoE-&F8Jlu8ZdytYO>>puaTT52%_d3exuhDd8GZ)<6M(X}oCRX8onAjEavbe&;PYfHb?GMCiNy$HhK zyb}!1{T&qG415dU!B^7VMVBW$EvYaFs2Vc^|FjlY2KWc0{j5pfvgg6CpWTW5OK$1; zq+9$JT=)}kMvHp7;ZL+Q`#ZP^C*=Qye&s+w$Fx?U1dn#|z>HhF9L)D&*Xdc;4((oH zWWMYDA-DqH!c|G{e&@(;DGG7|dZFF5G`#?);2Zc8T!$Hb$NbL?ZNd(9SKWOMvDOxU zuchCEZagi}%5rt=8wCyi%UZe}%>RBjs40liPulYlca?hQ;0*i#-)ogrzZ-ZEoY0f5 z_VV#g8)8!TcgrnZx9mwpX7!+e>c;;VdlM+!ibrI(jmF-hds0@$1 z5+8@ll75%8%P)xTf`-@isc;KLWw&Xt2q$FzUX<;IYhvxuuAY)Yl51WD&%oF4C&AzA zTJ3YLZlp)ra08zB3mk*XZt3?h;6{_T)BqDuT2o`jwhS(afxdTprz2TVsBULiTcJYn z@hf}(m~6*uuEO7@MKm;HeMo|-s#9#ke@Sagk1Kjwl=QdB8CG|-DYIf5|3QKN0nSS3 z=UuHwueL-51sW=5UfTQ9k{&nUw5khnlg=%$oKw+C2LBUq6@Gwk;Dib!^R8bza#Jv3 zskC_y|0&`BtlJ$KWrq{@ySj_2$NwZ;hd*fTagVOe2JQ5#?#Wbg#p8bjF2L7XdOw<- z)f>4j%y~D%WxLy;q_x$5hAX1;*qKO8mk=zg=;OVI|10=T`0EpqR5Yb31lC7k7*bdC z)=mGnv~<1XV%SYJAzM(FF8t?U2Cl(vp`c&ck6>x{!l)|eEvxj_!+*mu&yY)?)O8EJ zDqOap-nJ7u0bjwN;HGw|=G>g%yq4Td%YX~`|LF058g9YYQq)!>9f{_K+z6W>m^1Wq z4gLtX;glS=wsNR}k^5cIV?`{Sk)MJ7&SrO4TGw%2CYk+8Mt+`zTkr?Cq)W#2JLjc3 z6PJ~E+fe|0a$df2Yj+!NOm_IOJ~3liHQ%#ddf(9A=rN(#W(HVAOVQ0sTCkdTI00g!DK?O7Y!UUbPebSR{_7~-*ga29UBHEkX)xk+EBoTDT) zSm~N}nS*J#qTtbxlwPFzlUQB+fuz=T{ta`4$K-sThfz0~*2WIx)P*0xC0NqH#8r>~ zV0#9pYdHg@t>Kqb&{RD3lLbOba00GomyV}yxOO5!7?bGP)W^3B|EJ~oPG!6Oae$

rr=%TMqakv(4mkasAyXAGOQ=S zjME*X>P~E^dDw=IG&OY%b?z?)IvWYnr;$R=5jqa73qLw;3RwpKfq~SUe1mCH5nWUiiYlQF$jq< z*^cKG_Aq1D7tpocAdRX9Om3p=$yKegA5rMjZlUeuVL>|Bcz_AG4!7m|#o3+3x$X6N z9g1kHo3@k998~J)tSZm)*ZlL(bEt#+V+3H=x zeB^v;+|Y5R;0D}=%L<(vp4!d~OAOe@;9E`zf#m(|g;T*Q%*gRVp3xORGcXeZ=|921Vrl7x{uy5>;seBQ9<4rGLZiI&bLw zc^H?Q+H3d^@DDmgVMl1WS=UVn_kB{fHpnhGrS8a*>o2OiFm6jmNJ;VJAxI6M#}#)k zx_NDp!+c8-#W9r$b>JTqK6{+Fwd&O^H*u;8k-BN86prfba2<1TZ~N19MbsvVLN{k9 zQ%PfW7mq5uS(WLNxK^09h$oa8&A1w`4nIsO!j)u)wbk5?zJR<*IIBoS2L1uztmVv| zhEqm~eEBKCpQJFT)?mvUd0pLdVTTH|phru_OGf__qUS_oGxMUt#A#AwO^>OSxkEn_ za9XLmybK8Qed=ywUO`LN1GDEsYwT$y2hHD%o$ru1T(-0%-fG-B-N`eGoF$I!jq_WQ zA`R7PTR6Hy&&SjqYb{=hq<29-8j_Kta1zeK1Qc>G3dfc7Yig5C)9LV*lZd^llsjhl zAA}ht?(=R|7fyjCGAu%o@wjTlw&l+$kzhf=f_|N)AY z#q7Et>&qH|aa;NvyT07KvpSp35el8g|B=3QxL`&NTT-!`tHdsJu}vXQMGjYk-D z@hfk*HeEO)Q1ze^GQ}br1KBCJxDm4rXEkp@OR*d))Fy!;I40jAaYRGbYcRqZP@U&+ zD2M2EOUz4dXKGa-Y?q2j1{4KyK2Kvnh0EqU*-?E9mno1TB^3K$C8xsak^;>o9XL3l zdAIxq{bQh?PD-Iq@#S9E_S14SXR!#=!P|`v0sax!HDSTSyu9I>v&Iq$>;N(*fG6a8 zSc7X#UyGee%qx~;0>55@AvKQVvZ!u5?VUQ->{p?BAy3)tLY%IP-HNbqBXwcHk^w!i z?D;wG6s6e^iNn9y0|gc8r_kupc!SYw@TKi=-qcU$g!~a(dT#t{;clGEXf{MsmOxJU z>Iw8Sr~-$QDwk?m-vRwgBaNbpOgsELD0(hA{%yN8A5}Nex1fC32o&;Ed*v4yleXQ! 
z*_u;{ZOM;e!+)Yd`@@Q`;#$C(ZsNAbMP8heK(pWmW#@HyNp@U(?66x>wfra)J4=U4 zN}1VXia(Mg9!ncGTrYly&{DS+3b&qRCo!8y&WXdC=AM;Y>1oRmP|L+HVwJlGn#m`W zEkDqdh{@@%HuU9o)*ZA#0NcUQSnPt__NLo-m1Oxjbsl9aFP$(X|Fz`)QOyDSGTgJzz;M@OnGcSnGZxC)olX?zQT z#LfUnm6PuY7HE&!{>=T2^;g@@5@|Y?G=pqA!P=)v33Gf>=lI6@^;GxYV@v0I)H&Fb zRp61BswL5zI%EZ|&7~SFP*lr^Wy}(Xt%i;e%Qy_a+O|Q_=eC>1?vDXd)h?7xt;h%` zY7~Xcydzsv04*oLO{mx)kko8!A7nHV!tLEkM3;GGqdV}IB)4-ed)IU_Ba$Cw{k#Rk zA5@FlRMF>F6BztP+AT-XIUCGwY+Kc~rpSTabi=$GZvRn9g&LWCGNvLu7IFW-A?Omf zbiSF-h8<%`L0k^d!Y@q;@*o0pDUM5ml*xut|io+7R2HUo9z=^KJ{jtB3IOD*_ zNVdZ(O@|`GwK?MU8Ouqg5Sg0YlJ-cP%4<3$=zp;^F3PmfcgqdrXglP2@w|a zzEAMh^=~4_h_Ne4j9p+j_Pl9IqwQq1QEnjEiUM)1E)mcs<7{|}-buptmz!Nb(RMdy z|E3``z!Uqnp0-=@E<;PQz94iovjC>fR_>tMENY|w$mQ_5j47`SwmHeNREaYtTXLdJ zpQk%TkVU8>7sB($F#(Aa>9N5dxgRn^HOc;j<&)Z8x~FyZn6E2}(#nR>j0aNv*Q#Hd zY-VoDeJ*jUYE8D}oHnexC1%^RjJBM~jT0cbEIroU_@zd}?HujRroV8efHxl-Dg&$I zYVz97K(m^eg5#>hQF7Cz`8#Ey8?fnCy_hmEVrt8au4$}71!@g9THXS^MzN9e+>1KM zhMNQ4sp%@~pG(d`iLxj#OZPy#VW$;_jKFJPWX{b)c%&J6Upp)pe_;}4_4QpBh zl{m|)t$%N0F{%cC*&*30)5xtuGJRT88FSe-Ry7=vn44#8oMw${{P;nmM57$<{F}*$ zn=JaVe1OMkmWWjpfsUmN2^;T^xU`-Mm$**VIy2X8CEoB!~EpPF>303oR${f&HJU>p`dcy zbzYerZ~8HIx3_9@aT*#`v+0VQQm57fdXsE^$%cz!s^YW+B7~|4Ggek!o&ngMC+^<=% zzI-845&kkPSsC$`6kn>g&3)e z<(M`zH)}%`xF4+ubxniRtK!>s77ga@)ZF0id22-4Qc?J-QbDHgO^jPYtD=oCZ$cTQ zG2|kgbUuO!^VpF=Q)@%tTMZ`~^Y9){$%6OBX3&>8mknChc{e=#@8N{P(s_4jh!PsQ zy#;N)`2^L5<^;{FDyql1?UA!AQ%60YVA(Hyeh+i1!pJ$@Zx)D&!HRmy-`G&5O~F5} z;Bw+BX}urO772V(4P~<__~&5Ec>oCDnuGmv~RV>A_nHH4tA<-4x_hP{RCwL3g_#oZf1Wa!^}<(JN{!8|O$1`O!47O+g5Vck>{$vb#w@Na0&)EOn%3%b8{)_^w! 
z9!UNZY$eEW7;+OF+Y%ayquVV}{{#319vK7L`{SDbu^l3Io@+xR&7Q(tEuC+m?+myV zOQuIH;%Con3BPM1M-2(b9`(Oi4O~9!$hB4e`v`u9`_>8AQeJmV zwFHT?H!b>Z((@_&2KQ|uN?m<&1FGZqxxbu?4cA1#cm3kdw%gcl(j#`k(8!fjEyH8@ z2_9G{bt9XXws5bR;uiGYa`^w11j{K}vbrF%Ze5#qe}bpxg!CwfP;}Jh zeaD~}%37O!*KbX&>HGbvjmhtLKNkL0wYGVi{%czLKI{fCwp=wsJ8QkX)8-|xzJ&2F+{#+qCE{SY3)hd@6KHQJ6y2jMJWUw&WH+UPsF zzh{1#jNz z{l|v?ild)`BVnKTi{yJf)!O8@o`1vt%Uf-T%VD%#{4!{td+N0e5B$5ilYjdZ z1*$m(5a|lHU|yS#e-EAb9s9EEk|2rxdRxD}sn)yaa0h;c=Uqchw|3YYCi<(2xXr<1 zN&hEZ!>D}gW$HL?OCHsdJf~p3+}DoY?eEvKCcj0z%NhmvTE0uS^t1Pkjk8+>VM9|zo|G$!4#GV=dH6r?ihpx9tN<}mNPJ1tWPgHRyM~dKoes~tu&32G1lD;1e}kXkMb}X1EqN-5 z>*cbf&!$FEzR>;O?;6NjQ_Q9zS#Mrj(?w3-xV`c3T#H2~iEG)&oJ=3%;);C7A0_=V z^k3G;arG}aTG|x--+|xYNf-Usl!vrfb1D$3T6eYRv|so6Hy!?cv5BPtS7kc=DBF9p z3;vpZqTs69+b-<(R@yu8@4SD%Ara7aMWcqihUn)N+=ZXuZWsM;hy-wUQ&UV;Rp0Y7 zC%xBqg+G=KHxXo2H}56S>7Yy|0O(v z$GhTx8J@`4UUnnp3gXZOjq|+^FS<69)Zwip*0LMiXimVUW=Y?Hr@M}%pBK}w!zGx~ zNV%4n>WxP7Ki_qn@+Y`2MRO**lB2GBeF2Z)5xnghN8Et7*%0r98)4LTOT+KM)4lMI zJ~gMlt{!!?t-yPDta0M=JKqlisU}J98u&Ujwi6z2C?Xpq2!wy{j9*}SQx?Q@zz^?9fCv1brwG?t(!gAVy`A2b4} zx$Da%n9|tRF~uXd++6*WswxKNZJNR(mLXsJnc8#@Um)6hiPd@{2+)ZBYSMkTXjPQ^?tnB)7L2dfS zHIp)t1-Bv(bawxRB;#5Lb4&|un&Qfjy8Oj`;P20(U{K>zSKtG@5&j#yzMO+&TG~D# z?SXDdiJ-MS?+e z1=YD9{g1naRzr%2mQ^Ki@cH{;Qj>1RWy4yb$=m($A4`6m%I=fcl>7+vzrN4+C*YL! 
zp$sV;UzGv)`oM)k{Z53BY4W~Rx_wgp+P3u*?fi{iB{cfo@a1J#Iposul1u0moCUCY z$dzWP-nT4%Q?=J?hgs>?r;qn47*P zwKd_te&G9w=-1-eEw>WzU=rwcD?M}at~az2v~l43MV;SD0cwf}9t!?Q)dT}>@ArnQ z7CzwV5&u{|sH(x6t9zLq{wZYfpNjssOBeTlV$uJwoam~Mzq*TLuSaVsGc7Y<%K{l2B2zVLl(5B);?=f3#-k!xtEABOICh2Osag;!`Pts|o_a+!oLr2oW)@xSE$ zVmIRCOGqGf)2;S~Ki>0lET7T%vhOD@p<4T5_^m4T$d8fW7t&AU;Qx~ETj%H(-v2-L z{KmigiNJ{SO0ty?V;+Vl=gL)3o^MOdCN1~scD_M%S4#`pp4=L*!b1Rs5W+OAI| z_@~@{t5wA)7hrM!rOo{g{*dFSs55NA?-GqRFF^E{)k% zB>(_`07*naRK{0Ro;%Z@vlsp+6(jAFd9mf*U)Y}wO8WkU8Xgk2VIsi4pBhPERKrN8 zpx^D*Sb-(@v_JfVFs+UAKES`IB_E#-q@OVbc!#yRX-iADJKd@Kyr0wiV{V*YRo&3@ zzQ1uV{7;C4EyuF1=xj)E?8nBvnmjt;I(}+el9K7K+x1CdOkw_G8ve2-%3rV@6?;8H za7_5`$=?6mfg9J+B;GL?5evtzPUC~`uiJ}$#zg!U%Pbf*ZB^rLIrR%#?IYe68d-pbTwRig2FFC8=tr?vWOMjsEoo1w(xu}UO%kF@u|3E?jkx|GyMx}rS&OLz(M zyOQxFT+rSotFo$V>3n$k8NA&K{vkLk`a7m1bEG8iTX+Ib;q^Z7pVr>4Y3;5+9^^8- zfyeNC*D9Gv+w3mt@S{*v-B>7bdIHbk{a*Al4CmpBzCY+1L|5PyJXVR^?s>wJ`1cB& zb2h-btgpBFiCy52VH(cEB~?lFxfuH=x3T@*UN*!CoQCr-BLiw!9z^IaelGki?F0Ws z(O^zt z!i75B8R&ugJ7r)GoP}HPEnIf{xg)pZEf8iuB=FXsA%f#t`*|IX>n~aTc}2r81|Z#8 zria_x^&Om*^s1|bHiiv-Jl(ZYuL!5%YwZo6b`#*MQlh7|ry<(8GVO_N@8F+ujgP2L;W!+FUU=vi z#)fAcuE1@$31`Ip);6&PA7NUV%U$3fhl_Aii|fWzrkg6yd{$ghf@eE*a7jzUZ|JXv z&iO6Hf7W4MByfM{((B`J9ljO(LvHH+hRWPdX{`OTo%AyZ=Y{{Xs&TZQgDNa&sZU{7 z_=|8BzK7dd92lg3OO59hIOA5jz3-y`6WaQBC0ixZ)W-QqxfKO?wG;nCa0$MLZ?*Kh z=xWY4;VhhlY4{Bu?z8~{|9Cx8b7hFQ2u+&baEwxvux8 z;imTfoz%>nNX#ERm>FrM*E@Bd?x873^;l8mZ96E1lkhG43BH2q z?AE;y$scojRq7f#(|M-gCj3Rz+wXE)HZ5UTPHk`tUU(WBfQ#@w{8<;6$o7vl>}gzO z%K@Jm8IWjk;h*8UTin>zAOl}|U`>?UdVE6K=elYPM`XU&l!lmcW@!_idpoF4Tid?X zCYyf653F?z#TH}NJ++PhQ*aypq{Vf;P5>wBU>$TT)tcUp8-O$L4SX-@-J4zZ*bwrq zaWez|5M0#K_DkA+WX*=5iL+}E!XqEcN#F$Bg1=~SVV}6aqyZ>Zm~b0eYo2}r{{JZL zKjM~F)l@VwqPPT5h3DP~?157fz(&7mb}aI^^Q)t-UbR>T%n$IjHvc9f;f9*dCbefM z{HJXhIte#Ke5al7(G>la)JJT!vl;mN;GCrQRaq&CNFz~7Wkdp?DYqs=Ki|O*vK~tM z_ZE*HP*BV&Cq8-juft!oG``O*$_m1x-_^YY;bDC(9r`;Dx1IJXxpu+Oz&WglNC5t^ 
zr=J|0f$v2>rhS{P`)*uYy(})XEknoPB76(qYGM?c9}%c<$kiF7^slx==;t>4QHy)~J4FuZcCAG*8&X>IIo)x%thJfPUHY~m8B&lnW%0PrRyEiU7i7Di zmWpU7OP#uoucDf6mK0d;NRQ+2HGBtGw3)i;s0Tql3TIu(+#62-m*G1{z=_MxN1Dem z;b!7*Xkv3G@^THXX%XCzx_pqhFDR{)sBdP~2rxMRSK%ADsKs%cZsuobLJVCDRyDT< zFYOLT;iB+ABir2ZiTpYEyA&lp^STaJEgK1p6;Bri=IsquBqcDBwaa&bx`)=G2|*YT9oAdnR` zuXH{#CHL)u%F~N(;6>sZPTKM;H{glEKMLov?DvxzSdOrw-L0@87F;)1+X7s z(C5|yv_!x2rhR7MnrzRJEU0bi;6CB9p=QPnBarbOYol9kPC&F{2wbRI3~Av`;gG@i<) zZffuPN}&HDoPe+3ilk?%ciNiYnuh_Hb`Ir=De%x9bOA2H3GKp69rLv`b7DY2y=7R2 z7oLAF!ELw!$Ffu6ZFWLIIpr<6#htQ_5WBbuH{gWqBRjMGZ$pI8(765uqn|0spG)#x z^yP&AqiZ{lC6z+!(5D~(3vdp8fZN*pjdO3?&I>bbZg;Q-Ou=pVGhC8`VWS!%yFhDik}aAjVRD|SmHL&F`^f&dKge-Fpq>Dz+eMo{EsSsMHsfqR1V zJLeY98SAvfgNa+%v95%AE0{l*;SX>VCKdT=yD29%4f83AjjAfUu7q1T3E#jsa8hQL zNgv$Aom4WgqKUEVz%lq5{scE*+AYm$3yX1fen&%P*Nfo;L*l+0&h|~*te+$cLg-yx zH2O*5EPMyI;e<1g&3!Q_QkmZs)4{fFSe1x08q^3lq!AWC(+>q={O@PARkuha5$ z(j8klnr;GH4L*7JPs1(vTJM_xHsrS*{=)@-)__co!7cgDrhQv-92>6xKc}DG@c18u zt8zchb~m~23%N}hR@{VRQaA_S!*_7f`HoR`Y)4yCB^bf0(!q=?cybBG-OlIOnXi$X zIoXyC*hxqR`U%$CgnYl$dD@Yp2{uD0R0b>t1yq3Z@F&6FuK{y8$3k^C&neO5i;sc+ z$Kkqz-)!$#C%2g)v4C3LNk6yXqO`w7ZW?aiM$xvEshf8x1Y!*Tx8b;3`i{eVBF-@u z2#J~1p#8pqKfzTPaTZ(Nox&=G3yP~QdiXDi{%7R!C~y;RE1NCNWt?!WumlTWTH_u1 zo%+wYI(I8IF2V$yQ9Ln8U_^b5{n;rohCrMi%xh`*F~2nYgzJuNxvyEB?-o?tT+kB= z@&ok3loEeMd6-o5oL4LKIlnZ#M-7_8l5bva$8Pxy3aU&fBpWAi3@*bJ-Cy2uUqiEw%o;7i zuxeC7IE5kA>R@AULHnOBhw$=58ct{G$AD~C%x>}bXq-gQz!_n z$Z@Qt;Zsr_k%UgmU1TEq>vjCMP5;x{Bi<{TPlRxPmlg{>C15-F50yV7GV^oUi8Ha& z=Xtq1!B$Tr7*T8cKz46fP2+kRj-&eFw5t;6+~17bSZ>x^lrq3-PfP$A8DVFgv0j%t`p46PpRTpe(;iv1HgKOp2JN3>+Tm}j%kH5JBgtxhXDX}c|( zuwS#CdtlAt%YBOTG@Z$9Q55^fCiEzt)@uTEL~-}L12wjtWNsgO&;Ze%0xfU^Nx|V=@g5|B>!8mhBv?%6h$Tf^?nrZcu%&WSax7Rw9>hh2$8Ps2(YcL9&st<1prl)*v^rH+CAH%7#aiKEkov!Q&iQ>s$Skdj+X zV8ECu>oqi;%kI@c(eZB#t&$rJV5i0XpV*BcHTf0BNfZl|M_hg`??eqQhfx(r<^^xS z|7t7&I^|LiM)t-wJ{nMvybOujW}ixb`*p~iZr+T6#L-eX?}+H9F|6c9Z+5~mLZSJQ 
z!My0x6qph$$4U+jh)yHtJGV(Qlx#9M@NlQfx!_W6^&K+A2Y-BAA{aEO85^-CESio!eOyNh80y=vJ*a3vL6qncNn@Z@9~_U~b+9JLlNy7dCNo zTib%j@YfVNEFoINjfgfI*?Y@P$HPtiVsdR>nk8c`MG1eO0t~oi5!U?X5z%(Wh*{&kqW^99OV02#nlaH?b6Z@B zR^(S;tIv%HuRFPG!Gor1Hkyu;J1m#Fwy#2VT4u{>vZlLKW1#?lqof33>HR1_Aw@7LTFG#y1+r|0&RZOg7(w6^YqVdMxY&QA5NT8c5vuDrC9H{9Xd z>QoCdJfBi7xSh{dveJ|+i=129${s$4tifT`JhrtTFmigw$C4}BYG0I1Bg9mXozD_ZQj@yQ#tN@85@-Vy6;3T;9aW{ME#9uW51eKn05j@s zsqT{O5>hjW%pmO?vd5L3ARr@$jXBntv)WT9Zmetr3BYZg8@X<~ntQXU zQ#2i84MQ#WorcRBm|M_tI=Ex!#?l5w z`F@qG@0&PAs5v#Bq5nvcx1u{snwu9N4rhsYD`iUOL?hO%b;>gpIE8EtY(S*6} z#zv77sg_FJwu~C;Rjy~p3UzsLhWECkkXBQ%{)Sr^6FJtaIh0y>w4(lmZPnFqob5%L zNjOBnOymnGmb=8QkX}qS-AU8CA!APs+&R<86Z0YfL;F;YLJItFJt-0J#%jl=>#HcBNoD*-6`wRAdN1kj7nAi#Dq7&L(nxw8z zHgEx>svA~eHk@6a#n$U?GDh8pm%V=S1ZP)hkXJ1_+;Sq#yAL&&z4IC`?`+!+M8-d6 zD{d^=ihw(#Q476lrWLDW# zQTeNx9Cm>ww~Ypa*sRxd6%8RIaz=-YAL4_Qt)LQH?F%yWu`ZAl$g4Bww3YK z(Mp>Y*l>2KX?IJ4CZhhjQ1CtQ_D&y!KiCeNPxdbR35t@YP!`+ z4N>T(=rlt={VwKJcmKcb{DZpAzY=!#RwMu$I%>n2S&?%BQ`NAB60$O?sSPDDBLdq; zl8q3oCcCO5ckmzJ4=BvJh;=Kg%yJ6QG&I~}8LAbBOz(f2xUk~2oZQEzM#FRzSXpJk zTM8t#y^oeniL`a(C7W(jmIA74yM`~`vB0n)zb+Qo2=ue61=LN4aFaD=NyPfNKXRv_ zdQA$zH*fhPw4GcjYft>Tt;{TIo@F9I;4{d18IUc#zpSwcRkKM=t^1lE<5|-X4Sjz# zxR_Oy#C>)Jqgp8tStaes)vi+!y6`n*~}>bfTajWnOi#^Pvo? 
z;+D2o^`;3ei?v%`yP+k=AFUB;L&E~9a<8qH0@0&+4GkwSJMWhr12?j{qr|yzv6!K% z8pI{2R5j`0vy1uEWKic^Y}R7ai!f_LLE2j8F)yB|=rUW53?mU%Rrvbo@xK7?RTkNF zq}z1LsI8#Fn&t{ELA5UUSG-f_%)how%xd=jKLz}?RFVEkG~}7qhYEA9hIXS1|Dh&w zHJhxj={j1@rfsT*b}le*O_NI2oEosaP0QK$#vyB};p;R4uDC@OrXMSAV-muMeSZNK zx|VLw3I#QJM;1<~xMWaZzy)|0NWLyJI$!;8q%{YrD=J$ zhO>`bV&t-FAh+q~qXx-u>7A`C+7K;N-CV9EsDy&jKulXA9<$(!&g4ZM1fOBf=*ReX z)g^~;Zk-wQb$NEiFKb3WpY+f5thjEA5b`1_d;eWv*{TX!SKNxeEmy>E!HT+6|0lor zqz%hj1hA?iI|~QZoM&bSsc5pvrpN!BblRp{bK1<_TwMnEC!6b3)sUfONB?!_Q{64kWE^L zS?w*`W`GayLaWSL&R%P}=tM<^^E<6R`xI2&3e1UzH{HHL^OPE{5;7qBCCnTAtME<} zW~xr__%M1?KM|~jS-<&WQ#2WLqFGIr3bYtoLx1@tybj&;ZHNDl;@+CPvDDps!|hX= zlfkhK{~P!$iDUMfnQsTqHzYP4_){vkV;=gsb!PA~#!vt$JF#ktS|FfD9W?{#6Rti}Zt}Oh$u}jtB&HM|8M;2!+VFKrG~QgF37M!J?(ZOOHI50BtyxM$;Ns#5&@s${gFxsS*M={2jR z(=Uu#>N;ny>tajXy|?uffqEXouYPf2+Zl_)D$mKeh%pkQOE3pd;11liwaIl?)slCP zPunGJ+R|RH;4a*SSC-zWIs6T%jkM)#|EBXDX5oR>9&eALXsIxz-<5+{2D>VBHsBLH zghyO$|7fbd9>)|cjfjjWh;I{(#L08U1Hv3?{)12=Akr@Ce?9l60%@jn4 zJ~vT?Pw+&0zvqm8647sOHgyxrgErsyJ^T(o`%xx!89KcxPK=$^A30tO;}3q*+T~@V zpN0&rUX|CG<7pOTOWNZB{OZ@*C)5;V-vcERgWOlM%Ne>JQZW&ZSE`Eb8m)QvG&Cf7amxJa$XBtH%8mK~q#7 ztnShxu@(=$hF{=s@UUy3o>faE&faSaw>5Qe&%r(T2ly4zZuRxZ)B3N zrS986Kfl0BPd{Zvd3#(nyl0>mtPB4?!UHe8$p$Y{7x}WtOx3ztc5L&-gE=5Qb$9 ztt8u9Kfl6Dqo20;FR#PLE`-rgk}Vj%cVs;JdIPehqL!Lk6|I@S?F`s8(a$gNxQl)&l0Uhu_H8*=IvCo&!heF_y6C4O zB57tb#a2yHbMXHNKYIhRt&lByi?Pcx0Iy}+V5}hBRuVbzb+l#b^%K( z#|SZoC-4jW6@E9{+v9&zU#^IRw-ia;P?+Nm{4DsZUGQ(Za5-)X$A%6@Gzd zyVAcUfLhthTZDJ;OtyQly)*E)-O3;fU|ZO&Ebb55;}>|fb0YJmfXhZebM`#6N!({wq7_$0(=aqIh-P z|7-E@&+xcw8gNbg)0WEK6+?DqIxEK6Yp}pKGu88=*=ng{Y{M9~tmt!qGJ`E?- z+n0zRLg@CnkDhitJ(}lzMq>>M@&=X_`hLFa%KPw?)TTY9!I6EU?SR8qnulHA_5Da~ z)@R|gn$z1V6PZ`&@ZDbL?}w9G1~8&<0Q&fQh3Y->2{t-t4s#sZY#vN|Rm+ZuDl5-*5I>W&#voMq@Fi zI=%i8|4OF(vXb=&JSBB%_h%>gEvtB;vCqp1jB0Fj$&GIeM%2>&>vY01pkU&l2-xa2 zmk+u0dlbf9v%kH(1gm>-UwBfDc^Gsn4cC?SU)qax{4=Dny+f+1-I5IWbV&RUYnw)o z;%7z%OWt$d^%>MgsbN>MW0!X@{)b>(@284itZRDU!SLsVLdyqO~y za;_HPHO=O-nViX_($CRSF~|{MU_GaJVP+5M(;3L!E(PJ 
z4?{maZmHM+Rp$q;9Qb}wBEaBpDh~1S zK>9cQheHVS$gVtP-@W}65Af|ex z_aoPw-*z`N+t_&Wk0@5YzY zkA*Xi!2GCwC1eu61pkQx!7`DDeImDwrS)a-$1eQwCGsnB^2hl12qMG|-*jE^M|g4_ z>pj}%FBAmbk);z7kU0sMt)mLa#Oa8Z;$=tsz9Fct!0V9_WCFT!R20Nwu=vYptdOOS zFJZ7Q_>U%_x(K}X1p>|{57ZQXJu>{M@^nXLuv}Ka9QpoAuF$9MvapAJ3a(V5@dYwI zapm2nAP-Bzk#nz&_>M;ZHar4Fs*fxJQdgf9{M}MJ$ zFSvq{FE>X&)E;j+C4qp@Eu>ag~$tt-B>au^rOTs$jT zYADsacwmSlm5Vtl|EjJK&7yV3?f(qGu;4cre?H^#{LSa4x2=FkX?1yB{S+Fsy~I{jR=bQQ7W8;orHa zAmlBnJNtkq(BCl!zqRr%9~dHtU5IN))5A9vpIAJM7#o0b;m<-$%UUpSaEP*B^fTb* zUsewpA_yU}V~THDb>WIaln3+Qpw4eOjt#ZKE*=K{lAC?jry*n^Px|pd{t5UWQ;5q# zjGGEk9sK|bQOKQZ~^GRKZZ&3WZt`Pg7_-{}jA6AZV z-9>p;50pPW3egWML}*dQrJeFKqW}irB%IT3*#U@^rM6uE>|Ps>QIhHmT+rU=yvqAO zzwbvZCuD!}fL`_zN%zr{N@=kPL`a z=<`8IruX}>(HNYDa~gl&>qdcv{<`P;EFJIH()rVxo}5SltiT++hFAL}sR~N6orWoe zNmCV=z0$_}kNfCQOVXZEk}p)MZMr1c`+btkGg_T=QW^5PGGw!^pJ?|dB-u`?$Y4x4 znsp`l-oTrElH@T=!3DVN*0#1(Aw91=<(^5xUhNV%qwn{+!@tnR_>cSGe-h5RBx^xA zu21mVE$!aZ8nYzxjK=qeqNgo&cs$z&{s9$PT+rqn6yRSF|Ge1;{+zyl(T&e*d#WC&aL}hJB^QX_jFut4R7W{0@(IEfguj2`%2cq`k#C^+zpgas3l` z0Izn^Km;e?27C>dwH(~S)MZy`_}i{a-v{6ld<)mza_-noHkyS==!3_*Cdzx@3|x0h zyUhYAvjjKS~leAjI6BwW_o+2b0=9xF`|3N^;H^nGnt_;15aA=}OYI zaoq5K7QTjCS{yhaBQWT|pxB4u5xm_ggM#$A4A+JK9(7+8pst4Qd+<9v+ll`^IIp$4 zmtk73)gAm#;4VDcwa}{or{OwWRh7n&l7tmi`h`l>JG;_P0;jY$`#PL(7UY&JxLLRd zcXrlE2mGJY_fKj=aGwUIh9;HMY7%(2EB;SwaoDvZGPLOSzF!sn=HaQ-GW5()<;2zRAba&x{toUex{^t5OO{_GgkSFU^MTD1d$~UH<_t$ac0) z&)`K4t8OAhB{QX_Ic+Vx4OeA;`xvg3FHVZ9Uv%lQV{je*349~&wV@$x)g6p$GJn*y zbh#HUXzBGuw|oVtYM}S9f+`I;g&FLhh3~X9eONI_Yq@R0sHOk3EU8pQ_becYry`H8|(kJ z%MX~+((_v~A4AVZuMj>dsTc_Jq1R!(qMzGv1y0EPx9Kmxi2iD+Rmhr=YdV-)ZsafRNvGLw2U+ z)6msu7_5hz@Vzz%k7_-5tSFEbEd{IeY0$x`-Tp0c@1$7I2q;ou@06SWQ1M3Kgx1F1 zf|G7_Rit)}l45fuN##wyw7pLP@LRa3fe+U3-B3<7?{>y+z|#;9jdlBffXh01;&`*i z33J>uuT2mCRbhX|jmhsy#T|1KU)EsOGvHa7U*}Yz-qP(MJG$rwepYO0dJMfd54J|07(3X~+GRgtcsPLshSv9{64Q--(x{lI|V z!5MK$QzH;^ZgNB_ENsF<+u3T;?<$Nr;EOVYiaL3uh}?!30LP^LzlIapX(7l4C}Br$4LHZg0PH9?4Lmy8yu_`BnbWMTj&NC_c 
zzX@kVzetA_r4I&VL53@M<>~*7+|TpU(y0AArzsS9m(VWT()1jhhHtbsxL-71(BT6u z4~dtmJK>*#ujISTK(CwCZH@OuZDfhm*!I@r|04VV-)Wa^$;}6kv2ek zjPsqw#iT7aF1oFCM0Ir@l)e0%glk$FAA;XDt)b~gzxTNI<%-AuxYkDB(k|Ox$?4ct z<4mfd8`y%U&`N^z_!|DGrQyE3-}?81!(w8ztei+kxLuI>aY6khIVZ!(j&8}hrO%yy zxfy}*l|s*ChjrOy=4HgMz+#XgqwqESXYe%~lgVZyLd*^;Icv78v`Po9A$W2FPD^{) zkO8y9TW*;1mU5CEBkm;JhCj;rExMT%hyjX@svD}j-wM*}8vFp)V9FUjRh8D}9UuXy zvJyJ&0RIK}0ltOP&PHg7(TohN1--7a;d(%9FaW-T%P=AMQ#Y`pDFsv#8Gp8k*JE%4 z{vzrdkN|Ayvyt>pqW`p}nt6Z%oP?Xw{$tYEc8if)WsuX-_K;Jn1^CavAK*`LS({|7 z@+EQQFf9qgwgRgg0sfotXSktFv_(g%i6h}~d3EJ)b$6;b#U0s#U4e-QnckBFPpXp=)?)l@7R{Jqki-@qy7 zCfZ|^SCkW6)BV>_@b+uD|Hnk(4ap2EIkhP)4c%WQ$e)|=C-@3R1*g>rTNO}KjMP?5 zNC*BtxB@>&e)YOLi`>lGoEunM75{YLKMCK$_X>a5v>QWP-@D~0GxsE;5+#)7XD~D11NUzH%>acgh8gmzwYxCXW1{S+MYOT$m9yK&I%EyvlhZH@mQRHE>tEln=LxDvMmZbVK+ zMyTN}@W2opS2r%eh`I~ML}+>EfkdK&g4TnFEZ4N{3=Y2SsF*X(4vLOMgKZJeD4c}j zwlqBC4lk-fbWl`QP!)uq-q8!=>L%=*e@uzPX(b)(_M2{ibLu{RUaF#l|ByRAF6uMn zdj%e8Tku)93}^h(@F_Jo56X5krZGFbrI}vCFfBu)ga6ZN&FmNHHC)enTQVpV7@g2x zF71SWFl+nNCCMwQgzbgna1l=1()eCAc%M+^R~O$~wR}*f!%CEm2kID8clI&O%TFA~ zECkezWWpf3@kWf?~lT1 z8IERS$If3y-cX2Tolr17s0s$*gjjOWY5aP2fK!kHgIYR%Tw#h%gYzZ0pwxv&Sn8N6 zFBUwjEIkoSxYp*p6S^%os3-UleQE_h2UA`0=PXR<{3iNs=E47;xc}ObE6cV8u(4gZ z^xo0?DtfY9cb^Lh5FkMxq9?rw5CjQ&kp4j5q!-((fHvLC{8DLhDET%jQ{NEx~ zzyNTbnxfC{?+Www{fthlvUbJF1wO~fX1NCdVB69RtJW3iK`w|79kIG#!6WU<)h?&SY(6NRZ`O1L!CdbbFM^m!{l%CcCOr z(qM?Aq>(%SG%zfNSFN5;e(=ZxBLq2bU%((R&bhiwREMRoV%b|82IGr&%-4g(MHmB#2h)~WRC4E5`z zT#0>_mXAjt`vxodC6_h|f_xBJq|F}!icWf_q-3T>HxLZBnDJhP)0lmx0LNJM)4Xva z`}&P29u@wa@qZutF3b7Bi#q?Y6lh6&2*+by*^(*Wqz5aK9OV0eSgAq`>D8L{sEvNaZ~&(6#TEs*ffREtziNNGP#X z4m2EjY}7=a%<^`u!d((YpjKgxZ9rHDg85>BU?4o`yrE(dtr2ohEPG#lslb^fNZHpi z$RBU}72|ANw;(kWxlO43J(lA7WvgylV!Nsdn&sUixza?Gd5OK!r6Lacr8<_0(D*#R zoF!h&#HAr4a>EbWqs2W~3OBbg$mQ$t7pVJgI;IA9IQ+(CS+ zG6zjYfI@^uMf&v<%Ac9BK(vs@=SP_7Xlo*ij{4;M9#u$#`^Za_mtoSs05nRR9U5fn zzr`1h1VAm5yL9j02Na5k2RL<5&bynQtPM61Dg!NG&{)Q~bpy;4_+q_`;e0_-%w#FE 
zlh>qZ%uKSSva+!*WtfoS+{an3LTGB;lJ@2=rSfH-gtq2Q2M~J=*$3eZxilk@3Urs< zZ9B7B5e5$Pk+V8+plmvtMWL)H24yhgONWdpAZoyhyAgQHee4sc$fDvbuM8nY#ERe; zq=8KA#u7q<>Qb#A_(o1_!yxZ68L1L6W|6pM>d=Rf&DL77vM98`bw!a-*B8&$@A`^Z zcC4pVI%ZFle(n$bfU4M&72gH%Jw5(^bI3}fusXq3B!`Tt^gRiMY;r#lC5U656y*{{91hK z+ru7F){3HG?2S^<>c)Z{=8Z_gGB3Gr2q~U_?@VP`{}l)JcKNRCL^P5FJkeJ6H#{E^f$_x zc>8?2nEVrpbKhY#Pvq1*QOicOq=Fr=f2eB9P3$JjEj7()syoS>_b!vz`4|aIg@tlv zQxnFBDUgOOjtL(C4HiLyJoiOXlQT{3*sil@BBlTZqJTW1J0z=q2<#E3yj`iWP&1(F zwu?=Ko|aT_JPycWLzXK- zvBas^LzBc7JK0)B|9bxS^_`Vt6ZyTG9nu%4CK18XN`^&AEfT-({<1NX`2d;xa#!Hv zlUd#KEo;lG0)5gtYR0T7_|JAPmRN1W5o`i`K$BSj%l>+zTQYjNF>wG~gI0(CGLk@f zMi;@D_T2?`9k`&K#Qi-pacSP;3FP^N%-24Dl}L#VHrZq}H0FJDHAK{ue`+k$90L6U zRBql^Cf@$P9-R0rj{72`CbBRFmiJ(BCcdJYZkF zY5z=oTXka-K62>zrz#KI$66^;G0#3Pzr+6~|8=sLIS?_GtfsSD`Gx*T*ZIxAGZ8HC z{H8ye$Fmd0}e~U|68WIAV#`on!DWStw6m-63(j=Y) z#N2BlG2vjVk*p0OzY6SgNXH6Ap4^<(#O%o1cK#(`omrJI?>!(dPYHGYt`p#lQ>H6i ze__e^P!xdOLW7$*7Huz4l_2i>OrzTiwWSbZfo@g8MWG#cUFMq!troaZ5 z{EV|YrmUOfJE2lehOe6fbqu`>Tx1&E-X3(zEHC6Pd{IHe4}oRi18|OAdL`i-(EM2t zvP0bRHK36aOK%C7U=w;I+FvCh0S>s6Vy)}^3&4eZCf1aO7D1~7PgzvZbGIsHg4J33 ziiB>GrNlG7?*gy^q$#isECXxY^kD4Tomq1{Q!t;1Z{3GJ5@4k3Fjb zOWa7btqiot0Yvk0&1kfk+H;Dn0c;RqJIgqx{}~v{2Y$W;1Q~hZ09XM&DQNf(LFZ@4 zx)ua!|AIeD!Rg1s>nF#gm z-(LYfbBM%}M^?4f*tJRYt9OPvc)%;U$mKbv{aW5s@kJ0ZlCkV>I{HBaT z!Ejp#J}dw40c$kKBhs%A$p}J{MZ1*u9g|!(2|E~ZGz&abVEMzrL#M~ff zK-P@^Q}V+i4M4=x8ZVXUK*w|doWtEd{}%9pYi2^8-@ZuG7bz!j7F7Qn0&7%hv+NDB zI(Apus7`(F_&*82CXqh&0`;+@qb2Fxgu%)>VMAXU12lQ=2{wmXwKBO~bfGvIgN zDX{F=BVqhk5!`a*YcH0@-v!wW;95cYN#q*cgX4s!y% zyxnugtOl$AFM(fyr?w3v<9uDev{%?3i-y?14A=wS0>2YDE_Ux(A>4a`5lkpspe^<3 zRLxW1-@pq~fK7Imj&cqokY?IU=+^*N37q-Vwo;|+t#J2Wkwwao>EV>_=V#y#;8(+4 zPnf_hOK*;4GJQT@qw0D})WHL#e~CS`paHWUPD=(ifHlItcKFAAD$}_CVXq9h7?Kaz#3trUmE?5e}>qO6tbG!%Cm4CSOuPwe^wwTERgutif&ak zEB|J^!e_t}+vpUtfHfrTJX#Gl*Uju7clYG*9~RqI~MACHthF6E^gd1xTs9Ls42$HG9{j zqxOujyB{=dR+mOAEB31GYI6GcE$}PwU%+#-;0Y~tiPZD+5d?q+m$Cgt?YC&!BNGub z?d#=bD5azGn%eul!hSYbxa(s}W`SK~_ 
zVV8dc{#MxDG>=E4!D}caJmfyDUw~h`_F-A}*2Bg1+s>bm@`e0A zkcRvkunN2Yej(_3Cr4ogm`FNytx_2?c3A^n5VZV*?gFZj{v~M!g~UGYy_^Dn0sH~H zwDVi)W`W4ZZ3j583$#YJkVmSrQg*;y$ba+n@4B|pHDq$4C3XH-tjA~IIq)0s8}O!U z8)iz->!u7$nb&_HxY>?-o^>rGJ(OQ=iAs%G_1OU45%m0T;8WLD^`_L3kf(c%kcl6G z=fI!bM!JLirzZ7cPa+_T;@9N-PrH^uRaplzxsQbkE^K~F$NP^i?P2t{cDGolXkq&T zrT5p~^QZFrzQq3tNw^QJ0q>~&pL8uWJR}cW-ZP}o?E%{a{QV2~75J-n8)<_T^o9%& zThjc1n8z=HUzB_8lkxr9{Bx8{;A5$52O^?cm5X*pOt@N@?w*3)a1^bHQ+M z;?V=(BXAKo4@_~Er%vgA!F}qj-siW07c5bH;zr;iOWs=oI{op;mG&**BMa24Tm(2ck zf9gz09dP;YsDO6w^8BxX*B1QiXO$`LZD18R5A;jb)8!ps0rSW0V|TH!D^Onn&U3Rw zChL6P0xy9#N8*kj0`pAKf8_InELyljeE?np^S$)XfX_feMRA485+VSvKv2JyIM;~h z@VaZCx_$P5dD_Yi-~#b9tY>K*cmuoy-u3Qs2A6xqz&3D|=N|?Fse8O9q{DJA{a2*K zaEY+Dt{_HymnP57hMgukvyaGNP39Hf~|Lg-x+^0Xt^Y3yW zy`%qf*TWe-;saojQ>reV++vr*(#iDS0-jOT+?17+8Cymi|GWU+9(jI8e^=sM%tLZC zn0J2z%mW{QwXOzCaKm<$FF(Z{0D;sE-xBn_dL;k6=J~JYXPpn&=wjzTnt$H0C*&hA z&5WjN%cydl`fG|{`{Q+e%_uNKym|}doCRAs3uttOW9k!}6^&?qmVhO#eH{7mxLIII zYPt=!ZL9!mCxhw?5cGM9n+;N~Ub5dm>HKal$TVM}NR_xn9k6-w`F(Dtu+_E0j#ij;FR;9 z1EvI#s!Gsy*C}$+`OD;=X)bq2*ygg%D>&)=0jIjBS(6Whe^#}2eB@^kn4$=bxj}V4 zynYtYdUyVF+^i5u)8|H4@5FKEALIE|fo>5|YyDLInIiL#a9M?`@-MO3>7?`9W{GJo z+wkdtxx(+p$)Ii?5y_?m6w5ky9RKW`@canZzK?P_N0U7|>%iuz^mqI}$K?`5%Kr}e z=M?@AfpN<3IM))_*eU6HHTF+!GPdZi=Egzm8Ry5$Gc$qrTaskrX;Oz7dNCI2&U{f-u^N*dsUjU`5 ztTUYHQzrcjUq}C_M}7}^iqrWA)O=-D0H`K_-pIlF1lhqc=GmGp}w-~YuQ>lFLS_qQ^| z7hj&8Km70fzP^8aM$ytwB%Q`z^9pC6zjxBHtTvl`9sd_(g~-?Ne<<|-n)lZuG{1uW zq0s+p?mra$@s<1^e6{@hvL*a0=^qLGzvldb$nSrPf4=7O(3am2_nD@3!!4)3Ng&HO ztx=*%kcR51&kwL$-sQNP{Bifcr{6@0yOb)@@v_h9y3?H;*K*cRBOftwxog8cCx80b zZ#T$(Syv?9J^T6l*)?V!>K?FT(}8C>!mz;W)EOb|pT$Wm(LEdB`K``(rXMeI`Jml@ zgLSsuv(Voc7U*M&;D7)Hr=P#Ve!l_csO{n1vuFb2pMLrv4Gu%?bvwC7$yLD1>les9 z7vAlih5!43VV*yfxlG5ur=LHP=beY&WR~nq^6MhBLD@hSvrfKymh+eSaKCuCyJta6 z*YgicyHd<6Jd^zS++EnunG56pgR`99wOkK#;K|(k-81q}nfyN}@IIzLPUqhM800QC zlix#r#lfl1SES#y!0>jDD)e+qVDj4!ln~;7%TYKtoU{aNp71q;z&RG_#+hQMa};(7 zSOiv%W5W=b0WJaOSfnbkJ8Yf97>g(5*gPg!&k?k~%p9dF@+~q=e(d>&`209m0+!fb 
zyTk#@wG+-i1zco>W<)@<-Eh+auzX~N$$fn8VV!20aDY6x1uOucc>ZI>sLb8v?g;&4 zAj>f?oJ{{oW;e&U(cUVbZa8oA1R3Va~`-LIbx5U2lo4GC!hZunSWS@z7E+Z@bLutN5cQp z?6auTfmj4S081z5_(y;_;3Bt4TZ+{c**^ixM^?z~GYHH8=eW(gp9Wx;^8fK<{+}ck z!L-1JyYAGFL`*;B{IkGC!c&_AxhEoYlKl4b`SV=BoUqqyi}QG^C!Bwp@;gh+XlwHL z47>*xPtd^uS2?=^-#DS$YV0yw06qXKCzuiU{^x)>7I`yzPwS-r@^R<4-#^bLj{+OS z-DKKlVC97KoBW<5|5xDQ|0x3a4tNf{JQ6e-z&LOdxDVVVc!XWxHU;P<@a#y?%i@+_m2#piLp+y7m0eA`no^16}|xjs(4qfD6P8zrrqu zl5}c1{&@<#IgWp3ftvz)KOo)SuJ!2+@HdAxXmt?K}o@Eu*+ngT{cIYe*(Axe9Py@*sq(g zNcaYL3Ools_3}>{xB%P$ZUR?;8ChG=0M>zzz#HHh@X97cy68VcFs5rfzsEj*_Z4q} zrv$A(!Ul{0SAaXf9Zp^QY;&xN2t5Nn_4*J+-~w=m@;|{AH_!oi3;Yc{1Ku53rw)J% z)IS&L!1ePAU8klSvHIJMSe?NCSECq++yK5I@{Z$w*W$OqHhI_O zcQpOy`1@CA0E-B5jXL-udmmj~ve5MfqXb_25x516vf; zmm_0ORpCUJbYDvw%)1?IQl;V9s}2DJ^R^Wvd75a9IZ;X3{{H|T5_+pFII&IO0x(DZOS_V(}pGn{r@ICN|u+x!riw`omGQ-r(KPJETfy-o7UslfTl7xML-_@#K63pKl z{Cj0tEIS8W5R|`Gul&vd_xb!x5BQPK#_c|8l%whYJ-=d-%pY?_&^~aEty>=OUj67$ zV8ibLa}*LQV;vAt#wnFoUG(?KKR*!oaVQU3-(X~Cy}zHj&OZU%CUEC0Fv%__@cwrh z7M7_$J3deexB}b*9soCibK*U^{MjrpE3JReyXGuRe|}HcFIl9JSvqfu8bYZmPehX=Yvx$T)Bvcno|8To=%J zpFsHfN?$P_G$ZErw?{6`2l|w1*&j@fGcti)p8k`#(0v=`u?kHWF zrqbWZuX6yd(t9ZIBX3n*3tNhmyhqJkR0TPG$Sa1MAt*yK67 zZlSFDwOuR&q5?a)&~e}f@R;6je+N${1O$o6ZBaOo?U;3A;9KS#6HQr`f6T`q&;1Tc$xXiqBL-zYPP3F=;7rE+h%NWp=4z4otEamr2wPXdv z6j{Q(LWlk?`S6(GLnXgrH{iC_5-`iQ<2A!w9|SG}w}D#>J8fTDLbAJ1W{@fQZD7+3 zKna)uz5%|Wk5iD#kN7$f!@?G`lDl^PNhaT}0Au1fr5yP!%FtPrzu&Hj4bghsXXGDI zfmkmV1oXbihVl2Ny)OXwfJeZU{KC$F?n*{8lnVcJdJ0_f^8xS;!#~?nbs}G<`Mu#0FQq~e*i`yE`MoXl zgRBZQF0KG`K*o~Nw)2CBfJ?w#TF*XN^_$AVVE2dDWVP&Plizc|x4hD+U03;a=f6TL zlgPdTpI_%Qqw4ChZ39qb^79r;C$?EALV*7>>E?-6s&f9l>e%8YgO@2Ti?dxnnY=?R zxJj-ivjyM+@B{D@Bj-{cdd!?ayUBoA_4d%m7|riD#I4uvw@k#q6l;wO4iB9IxCZ_LqkJ!|8KBxRmeAxWi+5A z=}XzLjmbseD)1BV18|9vr!NWYtmplk9R1pM?$3STJBdAlo{h(mtPyg9gO2_!XTWa( zkAQE12`ZWvt%oUKTjpZ-w`Tlv4fqLo$c3M=FtEqV5A%>l+q5B=0v4#& zvc5iHO}agFGXdNK{sG)!<{%UUP>|P#Y;EiCkISDq_w~F){%L+;hnHX4v&tpMm;oLF zKLXcy|DpJ5@U5_dWG 
zh&d+TEcUhr%p?!a7RbWebvyqs&wrgoAM?004$s64#=0@t$e?UFq`?W2Yj$_a>0Q&n*qkXfei_Ck6-_ z?mBn+rEy+q7|9Oh3NUVYzK96olN23KL?@L^)fqd~T&KjefgTSr#W*fWvzF{BO=*Y0 zCcO#H(E;=;&NDnl{`^5+jhBHD;2eLi7!t#FOwR8KBu*+1mw;jR=uCEjZclQw*B7H_ zY9L?)Q|4E=JY(B~on+fbK}0tt14iPL*!#LRi!dbU{3K(qQvMV^7qOO^h!`bkxKlM9 z%;Z7lCrx#SG}?uH95BZ`ciXR=WPQMzKI;;-L?TQjU<^3VSTFQB9m=F-&Tb%od5(%h z9EB_!{b$(r&?lLTRJ;e9c+B|sXN>>t{KEp0r1$6NyR{Sd6I{|IV`N-!JfszPx`*W$5^IhV=x1)4fwnItP6JMD`B(+{o&hWM`S_D7o`Y zbJM&zzFCj!m2xTWB=h+l`JLv!tFLD%4Ab5&jv8m1&#=*dnwh%-f7W=m!0#$C3p7l9 zH3k#|o_~_DpWhQc>0LVhJ>R8&W`H^7*GqZf@noPRVLHy$Lq^>9mI*pPBHLz-Y(bAM zC{hGRfiU29hDq|XgaO407%Mgyd7fU z$V4ZRMU;_1kM}blTPUE_Cqr!zrEk8seVxx-&Jgq2?XLX^w#`)X&U%;oVw$v{c~&1t z`+)I$<_Oe*<$N>)<^$b0`v_1hqA|#;OPDtVQy77S!!{Q;Ad9KnHf@}nMM6 zidTTGfc!H_7AVNITNWgy_5e##BjmTZ3=Glw1yY*#gl8c!Wl?bAqY~Y;;$TFfsqAa(3862nxtI&*uon*34l~`t6=5$*UiGDZtDWfnp=tLSn`+~|9 zga*GZd}ntM$|R*_0#k7=+qZ1?-m;&Ny}?Nyyg?0E1T-QpMkr9k?7#)F-KvixdP8Bv zv56w3JQCk6l=bQ6Q$?~Bw(ZpPv%nXSY%N){pGq%^sS)$l<01>>8aAfFI$as5j5YZR z5QX&X%lTj@llb4XlE=oferEdGC$Y4jslHV30=>Kk6Ur`-(5)OdbLI0QQnCJ?ES-)8 zT%inv1$rOlo@_}X!kAJGHL0Q7dDf75hJwT{rm?h=r0J+g%A$#-$INb~A~3#0gFcym zfcOr@0M$Z1YOu~ zNV{oiJLH+q@~3Kv`c2;j!aotUg(Td=jvcXMg!vkKj>O8tK9!kibqJs75rt>fm(xz^SJ=O>lFggRhBW<;LC0zmObdH(N|CIGm~h+UrW~S zcTZ-k%V+)~WLdXmKNDF78w*!nshb_LX<@IfVYWi#0$v-f!LZfpH9&K?HoR! 
zOc!X`HR&HvX+-2_bJ077!IN)|#FbC-h;y;*!%gi;D+`#*XiTE)kjP`c{<7V$WezP3&3e-!~%z8^+%nX>-ogl}~ zx|k=DEVZB49>}XcP<~Urf>6K`l*smQn0Mw{_Fk%xLwWO{=O9tuJY{-*mFJkWEFkB; zLozyM;ZsivJz-v!?4K^B=H*pR+e+6Ndt1+4BquT-BBNK_5WSY$nv+PdXM{AN%;f$% z+^L!xQi#^7QPuRAG#*Ymyv2NOm?iODt7Ika-ETG&Q(QR>86G#$S z6Rt6f(KgG2k7VqDsu^5Ui?{@);!~*rn^ue&g|=j&cnp47gWAWan#k>2O)KN#tv!du z8ndCZMKxqvr6gplaUnV6Uz?C!ZmqnQ$VBd2dD8{rR=0n(No!=pci$M~;SZZsOij+E z>;cUddn#i-kn%4bQp)@CNvVXLcrA!Dd*VuKK&R4^Nqzm$=${BGMUy_c34j%F>?>l9 z_S8Vo@G3D=0uszJ-ev>ri$eaL24ov^gBsI3pe{>dmDxmg;>X}&4)b{ybwEr!)uC*= z1TjawlykiN%XcL*n|$Ra#2cwI{}>7hBH7p1wZ0k`PTqb}0o<#mRQ-UiekRk!}J z#WfLizQ32ZKD~CcLD2X*sLytQ9iU2I&!jb_o=l}BvngvqnkK(Z`Wot%sv1v(9TVf9 zmMpT=Op89O$4N=Aw#qY3Y>H*DVt48cfCaK9qzBwjG)GOpYqXu`{uQ{CZ zrfecq?Qcd$t(nK6qCMNolFFW$Dx=t#;0?ayMe3AL{rm0nZ*a$f&1qQz82M^WQ$WLa z9q98mum#llHxgkzlTfuy)w2FRqxnsE=goa4aGzsprMAt_R0L>cj0 zJW(K9>;kAY@|S7#!lq=K65;-sZMg1{_LQ#mgoZ+J3*@yWH5QE7eRm^u*>hskvlc{{ zGoXs~&_K6qT!p(w%vDb`c2PPq%n55z0Bhi$r%lI<6j7fYo0;gnMEHN7IO#Fi*S*hd zr)66bv5kS4goGs6a{O-&kvdRa1t7C~v0Ppie3@?3dYkye3KU;$WlPO+q5%(z+F!L0>YB5X?B;+lnZZX@-j=}{Bt8AozAyB3e{ zv#)iXV%W+%)Ukvp4X%~wEG60Js*-gY>rA{ZYp026dRBo?z$dr6z9kKniG*h=#u>A^ zu18^$OI_N1^n2Xpzb#$=VLmP^P?`DcVO_QJ*9fMu!0zHiI%du8nrBku?*1JhOR0_A zlT9g~+u;-XF^7Er1zjq#CqN9=dDWt4(FykGdu?&4%eFgFjjMV#7*|+$-4$@hv=*zt zXJFGsVC!6RvrkXU)R>uJ^MDS3CEydVW`jJtToSb-wU&(9qbQUI72rB;PTQ$90ZzpA zN$6Q8`PQ&4u0>b@>M!JguDe2KD*8;LEAsMHs+ZBzCfLBse6mGZSpbUW{wSD%g(<~Ach_CpPf7r z|1=Toh-C$Cs1nX#2gs;4d`YcALh(B3wc>WWwYa2dh5Vo7kD#BF{JU%p90Utvn-Of7 zcUMex*QxE7fmIuZJ|sVH(Os}4it-h-q67SKw7Ee@3&ilM7BvZ>M;cK_*s$W<#w z|2^&r+F;br0stZe_`gax=uZ2ApwEkz0`fU!RuPk#NNUF9VUNhQPYgCUWn;i_z6sl` zxoM(0LFaz~p4lW_B7=rDzzUiFu(GgI0Pld`fxm4dP|RMOQG$j?`S{Zp+rI|P1Ai)T zp(j(a<$SlS)C*;*cuv3HTNG#m=9xGuj$SG!PFu-dNkzzW{z$)Ww92 zaU(3ETCukvLlDmTt^qHBC%|)H$;m}adb#=qwp+QteVq1t2mGPHi|+UPq>CqzJVl+^ zITrzJ055?*fqC1e&;rWDnHiA+Yi|msLF`t?KMSURTr0ph+hIboT_EfnQ-nVOe<^Tn z%HJQBHeidoOmj2|eEAA^qV#W3|BO04e);vAl@plK;+ikI*ewX+|~$UC9{m$ 
ze;Ig2)cOr$t4v&ug1m#Nj)VhvuL-;Q(Z&ER-CGopgHSwR3rO7hhChMdfO%a7(8~9t z*(`@AzFU>P``^Hyz-v?Bb>=zBY~)GvVQWm^X%%=W)+aVak+LyofI8Ux&5W;;0=vK) z;1}TMF4$PJ!TszI>1Wv9WRm+(aLa!Kp4%z|>l-cdpRP0_c=;Rr{ojf2*QS3aQHxfi zt&)*3_Id;SPT1#Ns5Z(o})Zz!Tu#z~5c;FY@vusq9)ttHHo~ zo}lv!O8+DuI7d>D55))A5$*X-S9!E(F)9L;WKL|r+Tk|vj-cT=`AtLw`WS3z7SO$j z@$VBoe@ZJkKp&)@vMc^r6hgap!Cb-0zZKZ6}H(^B9wbcgR1#0)LwW*5u_` zduEXT$K-c~`X|T#DGSfm8(Hi$#Rd33a3A~z@B#P>_!;=SYaw6E2+ukkY$7J%f88c- z@ozkURc1PQ0t;T7^c@=9y|4Vgacm6+sUx7DWOJ%L1n25z~@Xs^GKJ&nD zz%Rh_F4)~J{_%xxo6LUi0PpF2JndTOYMmD)4r*5!^|&PODzFH=1^%S(*G+#L+p{`o zAO_U<|4-mQdqLNm;w4$tQ_Vw-nXf;A{{sH#T83B4cM(BiUn|)!k^g@O=6mU17ZwO4 z4o_L7TqFIQ_FU`beDH%OA@=)DHkm+)chIv1$f(VV;Qb}nnKty{I`ED@)+TV3Q)wZ+Ij4Z$ za-Hl}*N>-MED{5|+^8`k`_T8e&i83A_1$L+c+M>AI#-I7sKfWU3;G%GvP*k90?xC* zc>_4lxs(ze_Z47)xbyFNSE6UYBFooXz!eTARv1pZ)bA_0AB(-`-vD0l2&-JEnozgZ zXoKDo_xlL?H+j`JxYGloLq7p;fEU1<-c^(-6>^oJ^>ZA|bxv3vSOT5{e*!O$r2meb zf1RN71u|xh3g8dm8L-f`61@d{qR?#vGjx9|r06P3Kd*o{N3Nt_<`+40eNhH`Tg+3x zH-M*n)$3dwJH*244)6hZ1H1xW_40oM_`rhh5^$b_ z=|u{^qu;9&) z?Y?3OSkpzyUG$p-E&!Lf>A+)??514a{&7F<0b?A?pCtoER3OU)6IjRbpMGGj2T-Uf zfEXW60Msaw`DVC_x(|q1qTB4Mogeub=l#ubFARi#J`zymSo-(#{VxF10vf)<+WE&5 z07*&$XmgHxm0IFJd^{BpW|V6(W@I$VZ7y0Opwdb79|Y$3{4ibFm|E!*-~VX-34s}| zX_yp5vwhm&&nKVX^QT&IHGv-W(f{Ps}<=cs~8KtlO_Px(KZ{t+-i#P?|e zGJ*78qW(FF|0j8W6M|^h;Oahie_J^IGXTsHkY|ip+?L>ZESx}pr++T+dFKGP2!`>I zjTW^Net%Q|d8U9Nc=%tk>2Uk(kdOl%NHGL-s}6`tc*>_DrHF`b%^q9r(o>%=3TStU z<+JM9Aqhi5!VPjb-dY8>&H^dd$7*Ch1^9qJzJCJi+NaE9XdhFO4S|HLpZdJdf@q(N z2U_cH?bPROGQ5wMSLZ97iT;s1A2EA-$X7W1_k9+W`$YjC5*6`u`Wy2NaBg{*O{Uc| z^3M<<6>hpba{~^!s8s`trLjST90}715t@ZeH{s%X`4D$-(uGY=7{Qe-1;4voLWrN+B zp6{1}xb+R6iT-{bk`$}~9egPyoPN?FNqChOFb zSSIaszB!VT*2r-Bz4d-V*$&eD(y#FT`=H)|j1hkg{rm|4RBR|9lwN~@4+)0F z;aSY2C(Ku1cvAgR`d0)rzDfBx)BS@TGnG8N%vtr1C(6qc{;i!|O)LLo^88u#PaxZ) z0$ySD$n$p_fI+TqwGcg(E}dmy{bV+BIl5UvH_K#_V+q(inT>s>=_X|bUQ0mFPs#BP z08`u~IY!Le9bg?;1=ejx_HiGVCY%DsxqIFW;0}NPBsK_uabTJW%6`CSi_|(X0(MS# zeh8T4`G*Db*ywi({mZ}vbDX18AT=h)*G@fu$TG{c?2oLoakhM1Z$na 
z?b=!BU*an98P*75<_FgK{wIlnD^tu8^n8GmbsGd&SU)5EU6Czdm)DN)|4Cw82F7^) z5dp{B<#5vG$!atd{+T3ZY?afUYaHl3>HITHFb@#BL4{}Y@%c<~uW%mhl=DyU;V~|R zZm>~k`BeQg#LFKOd_(KfbMtX0$*=Oi)d(E_Zvd-0A9&=)>7OYsL+zJ3_Zpwy(Fbw{ zU;?;G(BomQ^b6Rd_n9q1pN`YvJpT^csY@9=+$J|Is>R2)F{=B5dd=;S;Ll z-goTsIhy_z;3A>FF0jQSB*d1RCVj;j*X~aU%<<)~05d?D;}H$%vY7`y9mm7Nz(tba z5-`phi)&L}23`{}{%9Ri;t{?9u9AcSCmdGU={kR8mx&Kd16P3Sz-3^L3xsV##I?A+ z06rhb|3koK;0ADm@YEjjTRXr9;2rP=_;4Kk%fLn87Wrp_V3$=X@_q!~9|!tA4O|7j zA@tY)F&15g_apEcm*AzH6*fy;D4 zO0v3n54b{{k>_2Y=^*aXJ>WiYO{(-Zp@*Wd- za*(Q{$*|sXrC-}O?v-6_MU$LxCVR&JOr*2dMo3Gjzi!o9g{xbd9M#q03$X&LGG~8=8eOd0T#OWX9lA%gF5>N$BX`-RV3B+y%ZPaO0>faIZ2ZnWhf^&?~X zn^D%!sK7?EN3%09;2OQms?YcGe_3GLuMy8FlARef;07>5udpq@p>fYWYR{X%Bx@@+ zG30TPL76P^R%4k#a{oUFJXhi0YjA>YoN;+U_U?29@Eq`fz=0Pf{ygNSrEwX=c4d&jBFj zAb!Z{tCyNf@X5eG0uO;%!M%@&GUaB`2L%(dL;o4z0r0;79sqMR-A#HhHn--ihjR(u zDgX2X*MXl1Txm_tsf-_w2%HRn*DAwy{-1z*M4q;h&k|9R#`yk+%nNnSe;@c4aF-7U zoc;7z(i)LL{`MBu+cxvGqd93}{A(gdD&(f>Sg?;ikD%)W;lUms*@XpHo4 zbn(wE;3wcQClvZ+T&thw9~M}P56VAIe!mC4qn7BCg_|DFKSBirrdHdZfWWW+0`34~ zG`+U!0z~QSm$G%|{FA^f;0NG);HvD52_&UHM4Tn}{%^YY=PK}+z_Vjg!^!v}eUj2o zyC!u5;3DuN<@cQI?6C5FQ3^}${X6HM0`39-1bzT6kaHsntTnBTFs6xtcfI=OM*`1| z%7hZA@cIM=$M2d{4}kN)BeJkr*@UXHkJ+sX58syG&{5{9U~U?h+N5tvlp4zj>~Zv} z{p&NpO-UXF`R%a*H#n8)P;Im2GEoKikmh?xb`Hd3Lvv_TSvp;H=fA?(_7d55dgQL*Nl%V=MV#{Zt@7 zGq&vREC0*^cNn{j3PcxVYh8ti&P{HsJTM*R43JYHbF}-A#fR>@%F?&8tLMJ}JOCar zb}--9Kld2BObP6;$1!D(`<$*aqgz!QG6vja`eIsg53yWCLKk+$4GiMo!jKL z3u-qV0Yl&-@EwDfF+p4Lcyx2@D1<-@PMiRKG`*u%0B@KSttKBK==4p-VG0U=b_xG*27A0i%BA)7>~O&@&H;~j{&5C~ULMxWlPw%z zTTGk(hk%>3zT@lJ zZ!&1MswlY}m&|T=-Z;nCnl2BxL@l5n@AififJu&`jevf^1aOYNQ-sdPG=3;MrAC3v ztTRLbFacZwE&x+}3v>U#*rCJ_ZXCD(TvpKML4vRkb26tUeTQboG8$1k?4WzTLeT9| znNKT-6XTJhMIvNhB!0Mo3UTHz~FP(Hqh2b6#zg3eDVX!ulnuCFs%nB{NIzM{fu z;5h?r?<45^m?Ukpo~0}1$_%s3W(4qkg1DRGl!~C4-U|3FuF!7DaDX7(7l09Z%`x}3 zSgU(RKW=~s?K5mMYJ-Mf%4dH39_?vCnB$krz!>0r0(xGOza7b@dWav~2h4Dkw!=T? 
zShyV(>`sUu9aD!ENv~PsAM2d6yuaB{Uqs(|A_Zb(yHQ@><;LcuZO4>$i~qzTyraNH z7OC3jcPw>@A`yz+(pPYvOaP2>6m`hxZ|4b_bBXdR&LJ74LDmB&fgoV-$z|59EbW{~ z18kjfRGEO$md?NkaE_zp*0$G@Wq}Y76373SRR1{oyRMA_txS?9UV+|Yi7eFS|A^0l%cF!Hjq)yA#3x(i=L$i`U70*7aaZynBV><4 zZvt#~JRlFcKEZMctr4pJQr_O0(k?Na56OVmB)aW!QL8nbCi&qgPh7c#-#9>R?*UmK zFe)kHqHKv)SGFV=Vq?~)Sw?k2b`un+^fM6}Rp13+5SXL^KJfF4etogto~TY!AVt3K zgq^>_p-MjwM43nsl}H>?WJp9zOb>+VuHIbguoE%Gkp^86(v|o zh$9g4?^~bk5|C8b`Zhq{EfufP%kPYJE$XA}GcJ{II4WByjQL_o9;IA)SLEeS(&5?l z2pX;qlwO!o#O~6m;c?}k3GrTBjjknn-->C*i+wf+TS9%9J$sp~scQ*LhbLZDK!?Tg z`;JF?jLK-WhAci=Am+E1ILhpN&BWDLrr5OFk|{C$ST{ug3|dL8go7~(vyHZc!qXt- zBc|g@ECx7Y9oT)2F-dLSgZU7Jd=N9hR^K6~Zyfz6C3BL|<+s;1m!<>^^7n>;VhOd8 zJpaZtlqL>=%mkPYcJ$j0Sl=iME=NJCZvmY#PX_1v+2rX1scU_mq625D3i*!HRB~aX z>~bl$4Zsu+RLTbo%D}#y#OfPuhH@`7l;8JnzKfY(`&Ua1_&`!%b)`sW39@qAg1;tT zz@r!yfzTedw2wKPkOP}A*^!;2-oBioKv!q-7{nxIf_W3jq)4MI%kl&Hdq%b*Ra!_L z-e~eR4Szxd9CC?TKW%=iLa=~>Cs9tG3~xuv-%?3xPY?JNOxh&hKH(D|}JZTR9xCz3@Ah2Mr5>m@duAIW;v zP*^yWVos3XB@z|5n4px`b+Kd)?M6VW)EU!qGiQUc*RM&oQS0sVE)wccyYfaE7!n66 z;5j4eK#%w0vpf+os~;A)7{p>fbJ}8w8qB3>F^+DV8e_(OY5VEQ*&65h;se;ESyHC_ z4+#Huxh}|W(KQlAk?e==O7*mO^G@LUS;i=F(yM6lYbC}~-f6KWf<#u=M|%H2T;Swc zfwtB>wuHO3LXm=sOqv!#)QV#F6B@0M?IV`<1z#tV-OlXuFexRu2GEB;dO(_4J2t8i<@bFg!=K>QNPjt*h?0~426~-J=23JLlE6}9p7Bp9kB{1n%U~V) z7pV=k;?NS`)eM-GH50oX)8=L+weV=MuP-H)Sa`|Pudo!iW-inGy~KIZ3cI+Xt9BfU z6j?x+OHHYx2XU9I2Ih%vY4Tq6m*h=){7(B5(`z0TcDqS9W6q_4QjM_2|6~oA+XkgSTA3UZcCDS>Yv3i4Z3^(l914{+G(ZLFg2b*+0~}v!|C)|Z#KOBeK@z%#EZ0*uKu@c|%^Mc*_~HR4!ock{lzZ&g z_aun$B^!v`fz&=pRLry|m7Hh1shua3gS^_o&@i`S{$b>5jB+R~TeoXl? 
zmM*jZV|jCHXYu66cYp&&fa1&-K36I>NrHypl(#@{V4tpOB(@SgndnqtYj=QJi$h|Ege{5W7ENPX zypnRcMTh?Tv|gs-%&Is?$I_+|EyTXIQPu?Xyuv1*l)65Y2uP#RDo{1h@D1Pqn4qqQ zoZOe4p_bsOQQq3Cuw6@u4LcO_73ffuOeV`?tB%H1o_}9pN@IcgEeI_^`CD~sJ(^UV zao&bQemP){#RA8S-`*Zl+XeQi5LNY>N!Gk%otrgw-1$uf?n=|(qSvJ37_4!GCl&}oKUE*WVs85!}7V~L0$q0OOgFRhXR_t*rUNFpK@=O|#6 zco$fAFHq;0`#C0aO$Yhn#F|f^umgI_e*ci#%nEOr^wG9tA}6B4ay?YL?!6AUW?-jh zfKQ_V(`pBt>f8nzZXRhF*al|vi7CsaH6(a#0_$8#bLhBnlPCb^WqjWgug$s$5&-Vfz&fx9Y;yiiElw<%P^;$e0xQ6}E#%na{nzr2 zrzgv-GVv1kxx0Pa&Tr=#pr>dKwkI5&aWda+R@k=le+E{`{~ookCyCHNtmitH5>?$x z?f^@`GP}4!ahFUCE&OeAf@aMIk*dTKS!OXf+Oi4&0Du5VL_t&}&8J!%@e_)Ejrb)S zK)psNtShX8YpTXlJ=Pfz)8KC#|LpSoW0c=gzLaQ;++qXmDnZxV`oOIjxyYHdfR0H+ z0t(B@HwoUe;pBIV{5&L$WcH7hZmgJP4oua4f0I4EBVv65xkB@#-1E!8XXBqk_N4Bz z3$Z0Wd6B6(kG_ec*Jsy;*_2iC(@f)c1TVAYLJ1Fd=%a!_`@eRO`j3Tcab{WYNrMHC*W4Ab>I{54p>nI zv3m;1$YN60*9=TTH(6FDxGzpyYqhlHr?~< zMAF#3_&QP49qB`P5|Hol;cS|?YjWm*yf^;`Xhe&TAh=jC+dp!j2evXcfY^K{rm3j);af_^L#&FWFgQhB}6*98oY*c*rv!!glBNd zf`}b~adtanqBGE|#*EUJ)wwfs{jqQ4zs8FmNyZ^v$e`w-zZFxPLJpYzr zqRtNxF*wHYcL3~Hgq2vTY*v)1UQpkm4xC7R;V=yS;>-C^TbQqpPXj0w`ah>nRYOH` zRi|fW5UytA#A};_DlqHW(zplP=Vg;Ir6X$D+t={oD;DdTSEb`lK6q0pY?FWbIF9~> zEg-+0wl*iKFb0 z_0D$JTdlrP?8MeAv#u0`!AI|JV&ZcN>x+4-B+qKUs~~P1>rhY7ENym?(0;%R$vo6&*LM?hWtVyh*i?@DJ{DE>PV%(opp&)vw&hX3}OzrW*ng~`}=`^e}dz# zJokdaTts;u$ZkuaISuY9^fLG^p)@o7Ka|=pV0)ryV|jk-{o3ODT#O8`qtuz=nL4sx z8!cuUM-V~LLF8)|g@Z{F4i= zcOT+`#jB02b}#;&#I*P@IT@OKBc3txuI?FK+!8!<{Jz&&P3)*!AkOR)eDj7xx)jS8 za+8jZi}}H6>tGmZ`=>#&pHAw4f|7iTdl@$deTV#jaUQ@V0EP5TZi_9h}OW z8skJ_Re6ehEWX}O_jhtVds~0}0}a4n!pY8X0aeY^RqkkWb!dW>PpbYhAZft@1INi2 z`hKn8Br_7LG}^ir1y}ex%suJoF80P9b(pqf(KWK@4Xd%QlRrk82UCCUez_Xm28zZ) z1BeqESLJdB17V=ohXy#;+&}aK(3&b_!G5vSg0}T`E@gHFir=I=cUT{AgKR}T3{zj9 zAc)teU%%Q52#racFp)iY8=T`&FT zYfmDQ(tuD|MHZi;Hvrp?wS7ip=Z~}-Pr#!?3lLd&;d5FTtL}f2Un+&JuE|;Z*W)k@ zXF8`s(+Q}M5nHbmLcbsepkSg;=M3}dQFy6g#5I4z$T{ilhmAw^1ti5tvm zK+So=Pq%K@t8Z*UAT#&;Asx6N5)&hac|?e*q{fyRRSk#$_lNp%Nk5n|B(_r=5g9b- zw_-F7U*TeovJ!UXDrqOcXdy}V{%Rm;n*((efF#= 
z^{iy>2VNYsEVH>oB1VJlH+MJqrd8zPj8~KFPh(VY=&iUThV_oI=iS0SzuAj3)^Tw; z3jVRf4+p9DjP*X4Z}Ne0Eqc1vV_uR9**)`r%SxsqMN#U`6h)#FlXu&Zx~i) z2gd2eqfgf~TEi=m8jMITJ1sly=C3`y;s*Dsx1YSc$d_MRcAJaLI_(c#Rk_#7j*OET zmqFq2HIcJkpknXJmM5QWa+TUH9&4wxeMdI||tC-4^@o)n0)JLy$pKhs<-{R@n(Y+j&}u<-n+YVi&!84-tH55bjGOa|KY!ds6!&fmnBh4;R_XUR9hc*!^8&V^BNN3Z4jIcnKMTPA9%UPJ- zA*VI*_pxJZTO@M#Bm<^*tS6*)*m@MI^55Ax#MvqSemwO^V9B`(%w7}o?7vHPcQY+B zp$FTJAE#Xi3d9DsP3b>|xI7Bb7F2$3%uJnjS%qnGBt{6z*lwAOBw6VEXESV6-cOK5 zWmKE-ik$?n0~RkNKQOWM%Xh3p-UHz#FQ|wKXQTR}E&t_})1L0(x;`f8S-?1l*H{Ov z7{vq+{y-HrAkQDU^+I003ezQ!$yfUN6r7F1MV&ia1XB;mDRz86+Jq6nO zu%D{D*@0IE>(trz>KLV7+dlz5)!_{G~GfjH5ONbArTS$`6oaxNwKr z^75lo_PCwJ=u_mIYsv^tg)`6p5TkM?6r-$pskH9vmFw}q84K`o>QJ%?(3Qdo?8kqwi+?f z9Wh9G35q#6UiXTTH83G|9Myi73v}H+rq$G2w(r}0h^d|DbF6{u&v2RDo1)JV4}!qy z#xWLDlYNMtMt+?uyP~_yxF~+>SkK{GimTf%rr1AMd;SWa%We3B#A*Kb{(R4k#%M5* zjv6b2d3xvG53iW}m*9AgC;TWG)!wH8#`k+**_3D{B^Y2**fH}+aV2GROaJ{PhK0Z9 z?vrl=OXls%qVtVHQ{1hz#xX3buq73`^OXkwT*35~ajla_OkA&GqS0tvGg_vnD08PR zVa_$IWVxS|_nBqKww`*;=57(Lycy~67TlUCaKiZmN9V#HCa!?u9n>2rt+>dosm#!V z=Q9aL45{AxPrn6T?fs#hpj-z7?RG`Iq9@fa^^Xa)%mWyW(jO)C0({*WUxyn_rI(`J%`Gc;c6QAa)nN-KMs0 zla3z_;4r~gDJJcXEiOC%WiWNpGJ;A+Q0jo60d9*#x@za$u8)sA6skt>oe~bG#AxJ2 z+$#np&zd`R%S(yHhlv-oxWMtpO92o|=snFOJ7wlS{M7#@@lU8E?r_eitQ3siM?h%< z6U%^ingzD~uxE_*zw-K~pGp&WlVoe_)Tz;rhaa(PIeqM!TOc|uY6^klq_#cgKZD^O zfU{u3^;BI?4x=V<*v^@-cHi*^rE9I>F6I*N)Ht+x?36OO=opyBy=j0+?$ucsi*x5d zc^h&{IZt{Hvu&M={m$jlL8vIEE)97I2*GlXdXO8Iv6jCp<=?;I@nlQ!%|shaUBdvA z&7&^TXdF9&-FFT<#5{GCn|oV-!r7Z2H8ks0;zE)fIY@+AJ%2rQV(tRjx#wuJD+<>y z_)%G@oS<=>QWmK~LUG1Sk*{xVkn(poSzx;?aQ03A=r8d0JG;nLq-R_m)bQsUoluCZ z0_p7u?!&?PLjbWq!#<9$88}wS2uvR*2$u}}aEQ?tzPHI*8+8m@MYL|OpI@hsQerr9 z8cu_+%w8!qZZ{f<0G~5Rs;M6CV%ZS^6 z-5FQ(0?U=u**+ht-~?jCb$GPo^&B*by-q2A@t>x+c^K0KjxnIY>F|zB`2LVS&s+;1YNxKGuc$%^MMh=y3RisabWVuX+Be50%1^zQn(u4Zt&r0cT2n( zs5dO@#E0+?rMm4m4f=q#E>m2DF zML&bm!Bx%}!*$^rMf3i84z7?>OhYO`4PXUQg|L4M<;@OiRF6*~Jp;y!Y6ZL~P?0JT3WA>Lop!1uu6|+!`5}(ukH=4eTh4bde46o9m`8Pzx=G$ZJ;$o79 
zWffTARhR61FE^gTmBc}$*l&9Koi$9C_;TwHQCj5=Xw8GiqYe^sZCfSmRp!wvgQ_98 zeW5yxx!6c-Cy-b3>zP;Uy(X+Lup0p7!*ClLPKyhPnm(n4or?Uz(LR;I9Ra{zWPhra zAW#~ZZ-+D@KBc0~02#*X0e=|_y)IZ}HuOb9$mWL&W7R;K+hkRhXS=W8n2Od)9&*gf zYh_pT8!xVbv2JR=r$n!)|HcUC{0*Nn7Hqu32cQ+Rfgj{b%;@ zh@$nCF`g5lyiQ|>Ci*xUba847jadWhebjs#$GKZ&?_6~c__bo-rO>CKVZIirU!&uU z$AyaEB=nFfMO+aS&*|MRvA4&O-h-@8`LKg;ZpfJHbfmfGTv_2ZnF%E=mZCGh~` zt)$|M)aZNaq<1=a)8*v7q0)rO$a@-VIm%qY(wFs)l0rSJiyvoWRY5lp65NRnJ!ND1 zKliIlCUb}z?EJV>$mu9_S0$oFsO@I~6q(RiOsa5vqkKjYXV~F=l`AQ&{a;Gj`y$2b zdnw@0szQ2DF~!p9UK1oUNC$w*Ahg3{n~KgD{gR$m!5w4Alhy3~?>z!X!v=O57XVR@ zB(NCBcp~#DOnVpNY<(|9wps-$$)#^h`zMHt53!DX6J^ITjZOo}#eo5oj6r~xGhotl zb_@$|?0o&P4u&fb$RfJs4@`_pHHX(TukmABe?J6Bp})PC_$`jLgX>}vf~~M0Ti2er zT|_9tXbL;`TcU`WiN5PBR#r@Ju7@*znHhO);SW*6!*i^D?;LTB^S>w|0=gi(qk>Q(ewO(P@x0CtK^-e>N!RrYlI)*?AU-YcxE5$%{UX2yyt1U#pYU%xEO!U)P z&j!b#^cv?;f#V>^mkMy-r>kCE(%)-7D;g5N?P=NSo}}9EVM_453MSqs`>NwvqL&!G z?omeH)?JU}2LIVBtKR!vkLeqw1f3=)e>@wf!!6nXA zA&SC}mOn@Rwr#qWQAxV* zMbAyI*TeAAhr`RC;4Cp7_cr>8=<0xMV&6m@(w8=mQSesR2VGRf#MY?A;`%G{I9PvR z&Zfj}UP}21GOzhwd>xK8o?NcyKGxnGnB48XqPBMr^q3i>>;_RDN8~qJ#k3JIp=c6ktq zz!A!zZvVxJY>bKB!&0P^9NJSxH|FPKxO5hl?|^3*2SpN2I4*Jhtia85_pu~O91Z*L zCYtvBfcC7~j~UwDs?jGE@d{Ff##*TV@{D54z*7 zLU_KwxZn17Ar)6dEM-P=)-|l66pT!X@1ST7Q<}W@4ubcofA?sIKV%=7Q>dWPF6nrPve>o7<<*-!r6dppOt+?&rm`CEr*4)Aob{ zS-M`9t+)dWHP*Okq#2bm;bD?noC4l4Th34O(oo&|{bh1|6UjyNBEC|^Z6>$e#8HnEZx5iLju~k=kWL!pO-H} zN)kgt)VRF4am88;HbTsQKi|SI$@&*u>s3Zg2iMX$F?JEI;`F^Lv|um5_t)M1l0g-i zwfSg-{DKqS!c)6)a%zr#&VJ=D6#K^df-AAg7Bfz%*{9i(zpW*i4SgTy0P1tE9g{+T zGs+X%^Lxc(OF{z05+}36^+*HE<8c{E(rjF^} z88TJHQ6%1frIV{<9w~sa;g%DM9QH7eyNZ3p7D}Pn)+@CbjI+pS*@Ir>6ymC2>v_cv zh>MeqW4$>+H3S4$km?Fye4bb0@noSMOd)qzIy8^7T@utvBdE3VAweBPS3S}w%DtEO z7Yh723AxL4YJW|tI?f>V1M^Z*YeN)L%n)ipQ+Mr&_9xm}ZEp+LW!Gm(#RoD7+`|bA zHY4gd2yYX3*j)unHR>CCp6@kvMg9@WR45kz^`-OSMM3{hwFak;=98vW?Go@u2hTGX z@ZBdJ26#323(~}TO!o=Jhu7s!AH&-R%MCPSmnNr|MR5+Wid3tux%8~|bqfhoybAtH z>yExY>!Dw^?7Vaz=%pvjbtjXw04{|ehFuVtXWi+6QBE^^@`m5n9{2s&)?FQ 
z5nsn~{UndH_*IqIrx3;Bg{Ko|`k$bJDp}I9K%R6Icx+x(VoVJXG+5*{ym^>ZH?91&BAuOu-ho$)pG@ZLJ0T$mFy!v7#6I`!LD&{^_Noj)TAvmzS0% zqO?b`2*aDd&x53{#UK z;})ZO{C9pBaNXr`&MmRy`?d(^|9g(m*(F|5A7=^P3-B&cCxa|;^t{vL+SFD5bv*nv z(5pXhs$iv^KfC(SjRnL7&8k!yS^_hJ(x(N8Wq~4?r^MY;}X#d5Ni> zry83-td;;*NQzq1<*ngC+x~BcO8%b@6Jd51RA{MYeXS#of1k`gHMhiTf2>83`LG}u z9cx;Mx?LYv^m4={2AD+EaOvL3xgy3dzfvJXHFeKF@nk}t?rno`NZ#h`=i))S@*H4u1LZioF@Qb>*QcR%jW?Y?bi7X>Y${qZcO zf4VI!6aQV2C>q=1iA;`Z_k20(roPkMx-BQN{S)i0(wxuBrSsYQl8sJ%L1=zbMD!NTr=t(rXd*1%1ey$m* zG5w=@QOWsDI$*}DPK%qk#4k5jehcd3J&;j->FDb~vE3M1e&bX#$>Rzh5lRgLtLx$j z;;}sFYCli!vjUMYaS=(l{<}7M=gP2kBm7!$yY=GajBJIi!oe{Q9AfOxPNcj>Ix9ie&W!DAdk z)DiMKzjn3p%B4l*{LGu%2Gx;I(=Yy9@&2%od-1rHrP-@My0KYu+V)e^e>8EE z{YClv$#0)su=2!7x6DBnX$lu6egaLR2j(vUaA`S;zZ@s7s^A z;XBDA$@hMqw^r=}`y~j7V0K<*rGlA?gh<8);=Kq0|11`jqpp+9Pd_-m;WuQ@`pEp`#&Yi6G!6A}_4sCQlB&%bhg^pfzDt8Xq@B8?I=O8ka##K;~ReTkkF@HvB zcjH;vsZ2J}tq(7_G;We)gdj0G+c^I#91N@UU?Y;ql7mQyl@YgnJR)3k9bKe`XHjxZ z=2IWy4yqZ57nwUfb18L@#lloh9yhO~qs=#=0a6T3kz)*5X974f8lQc!+xaZN9#J>Z zZt}TBP&58&w5F(-2ItI{T05M6!EUWI`aqfwnGer+**;#dz)j>w{-Xk?1+N~fNNwW% zk{A%=lp7AoDv&TQn}j#mQaJrC+4N&>dU0JDxtY;*`oxXm!u zPpGB4>^(?6%T;np=h?lbU&G;a%c)CTjZlp859)x7iycZz6g3rW^MYX{O#wlo%_oE9Y&w-d_Hckp4JV%v%G1dgb*p4b#u_R zhY(D$e!!|)joXBTq;8dDep)pZ%BR!$enTYnV~SK!V5{Z|wxK{;dV%|M7Up{^TMZ2Y zx7o?lTTik@{)c0;GJU7H{xs#b$Nn-l#i;%tnkDNMH=JVb_-%d=GkNXmJYX zq&P_<_XeV$!M1$!gclM5&AL+TVK(HkEVs}qp|8140;RQTP>wpNT!(zQRs!+{TJDFl zxJ5Gldpvh|y_250bk{$T#SoZzB)|=kAO5H-$lkA!=EIkB+ylQC#vX&#kJt!EK1yZu zlrFQC9qpV@R+SPSNs#`Sch_d8`9={hkM1&p&I&6p&iSQ*u8IQ9&125b<55nmKt-d1 zD2Qgp+plBtj##q0h-7s?*y+*{-(s=Qqt^oj*Rvzzsu2~$)Z>sq73QVK{B9~{lRZD8 zGNq2i{+fQu)sLs|QW7YeDkJEbsb8ROa$dpx4=WnqW08mIH$`OjC&1jH18<#PPafcE z`gH@_pC}@DNu`!LGK&DRyy6e)v83D_dd|uT0}YNJm~OG^7daMX0ZGe1cc@-)mZ+)B zs1G%ACuRYeA4f5xx1aoX^1`Tot5i$W{5Qu_ktDBEa`FK)i_v6n<$TDiEsQdAnZD_$ zZsm2c-g5J^q@Yp0+B!SButdLsd7|Q!t&_F#!T!@n)#Lm9L!6W*wpPex!i}^IX1QQi zMvVNMZkR=S!dl}X8~p^obT~^G*!=hD6U8l&;yOAbM^ZR{&N@9{+rZj|NQxVl=xbdl 
z3C5pNDS1uAf|KgfMbA*Ostg=ZjD37LvwA0(pagp?{Y&AcJ$aCuGVKi0VHYx~6WR!D zQJ$RCg$(SuM#5yCplpBaa1iQC*_=rAyL;~+!XveMbHW^#^CWOfyy8hlr%OFS8R|Lb znm}%^)3|;@rA3u~A4;~&?25TMm!W>qh<44zg?v7UA^mmI(wJGMb-Gn=?MFxn#xLhT zCKgxxB|Wdz0J7uVoTDss|02>=)`FCHvm^1*yw+4%Eh_c%8-lRd1xI;J0*+jhq52Hi zpl_ot#{Kb|%KFq-=B~=;Wok;FJ%K=jgN~~VZRcY{Jb*3GuV1}UA6j=$#;PX~+`T;W+`zu|VAh*<5=b938Jzn;g9IQ;r;+&*hv_QA|wjdhc>mD$hHz=qWd zzXVJ5@k{sAOZIFsN@i?Eu|HkYheT&H9YRme1rR7Y3ndr9Z#Mip*w%1kMexstmgj}P z2sK|-hkpOB4p~p~Y8F^v9TFGuPDGLuR$!!7e(z4M>tt4BYbvXkphcW$!VTdw8F`5p z3a*mSwQ_0pAN+RS-!$Q~mL6(`u}M{|nLNOpj_ZeCODA&BQz-kuo)7qCXvlpyLEJV7 z`e#9?6hjy8PCO;a<8E&fsH`BA#}h4`TX+(a)WjzAkV9u|29pJs+2m_5oVYP2$IW#(6Fr zu^q7bvHBnHD>H^fnU0+jqdB{_`au|)aA&q~;TD%;%}>4Hp_uSeqGiDN@+}{&Qgf}p z8D)Fiq819v{85-)Bg<&KQV#iFFu_UWe+Zs=dc>YoG6)@@0qp-9Um*mWT5t z-LBmo^*9@osf|uyX0fXK$giQ6^mvuXem<$HAnuZV<{a<jhw;h*XDIh`=Q+smIPG_?;4>d(aE{80{Xa0UDbZ`t00K8EgZN&{GU>{`t;Sqy;TbfX?- z&tAWnVu{oFv~)z|MEod5d+I8~zNm}YpzQIvt?yNB^T}lz3h~9K$n7`zyO00@2;NCv4{d8he!E4xLr^qg;%#KP!?{iT5F7o+eGS&h1me=2SVyDHc0Z6dJqt za7LBM&&uQ2ymv77Ib%lS>`1jWkvrP53Qr2Ni zEo8_Rn@HH2-30R;H$n?$fy6u8(6wUI!4&YOSXXoT zMoe+N4(dPou%5 zf3b)I@J$+4Kc77aMtAnt_;r1LDd*Ja|L8Y5IzZ0I7 zifQB}s#TH+aJ%~h*oct|*m_@Xe9qO}qL}sJkY9;CNMyK;)|HtS>Mo0UbkBt4(dAuM z8ns3KPPZdisN9xad_soiO}tmfpi$Laj(W022{;d5xXOHam#PEEUdjw~+dBHJYgJ{( z|6s$fbUL(M<0dX!L32{@dTSeT4;2?5Jhphl1xd%tGsYie-myuMnB+Mm<(FNv-BfyX>1}vF&Q21+ZbBvoaPxEAe0yv-A7DcLTbIr^2atMD5&u}@ ztm>W8K9(OK9Q$8hluzW40$aV`(^Tm4=w9Q8Yx19x4 zU;7Q8=d}KJ_Cs?0zRjEO(ak$PDx6H)hX?GseOIZf&--jJANGbV5kOh*?~k6eA~6v~ z5@^gUQn^+qqum87H{lGG9-{Azb0zI)_8)%QD7_(tG3|>?iD+TTWs_? 
zW;3)71m7X`j~ImY(R{&Wx z!AyitRG=sqx<#B0cjoI#<1EwyjBr9b`87;K)zL<~nA|rdDe4J^LN|DTAHjN7I;+at z*MF5-eMG&YLW04!+mYc=kNv+c zvZq>{@FsfkQm6QL$f@v!ty_QnD_QE=dFK?aa>$k>5n(`^5e$dt(?Y_AEyix@WnK z&XD1LTAyVY#2%0sgYV-Ox3l#p5OHYjcMxq&V z@6pvd?p~iu7*WjjJLe>@Zw_e+u3s$SDhFHY@WGT}N1-yd1FL0C5vT#Q4%k)lAyj(m zN1#*dDwT5mpPzvQ6{1z1;4|^_R~>}QXyWi#w+a&=qcNIfmDl&HzIaW&6d1o6!VmPt zhr*od{ijs2ZN^yWaY(OI{sfhzIF=pG2{y&Mr0kaUdcOCvungQ7F(7G+1u7E8ls1Lx z5zWtlJ`VeKbcyaV=E**U%7ig}P+xdi|t$qUdvI9)>h6xa4KTrS8%?ij4nz5WS6?fD2J+JrR@pg5LKD$7c# z;6gwkk`_Vr$hxXi=^O5i8V9r>&E=ZJxX4J#Bt(@Chk41>Yr6zrxKM@P7{jn{ zOYgQs6=g6g9tRc+Tv2QaNYcQ=V{yDymsFV28EJv4Y0^FP z5e#hDU0H8D)UKzh(4bERr3@qs$6Dokq@k+i#cqYa3k(PP+n;~-!{P!M7|L6!E@QZ~ ze%UwwxyO}45QgthLWzOYFM<=S1zZ#U3Q_}IyJQEpM<9PJ3U7!O;qrf= z(<}1SZS*kIPgI#`31Q2>kK`DZcX++z%q~x{%CKlv)PByG=)fiPVT?pLtQc_G7{SMF zp`?HFpF}l?wZyc8+@LFK2vztPj2gzbozc^m5m8WfI194hR$2@?VEc#TaQ$z}8_x1`*P#rA zWBqxwAoyKSjNAi1WCi*fYF`e1KQd*`T7Wl6W&Q=P*PO+I%j5E?hU)uPjs5@N%z;z| z6&B#-D6|5c4S5PWa4bVT!lH?1EWHlvCZ~~Ipw%tq&|LV&>yvKxA4^**L|^l3ukpT6 zlkDFzmY~G@&ogTX7*^-IM=@`1Pv%?KDxa$A*0oLT;E)j=AtI{P?grnUB8dNrCUQ?= zC;_j8-saV4b^>F8t3x}M(&>eI%ujUpeJ7#r@Y|QG?pwXfi48aGoPqm*2jhdWUSJ9Z z6aACB@6+-sxRW&U2u|^09fxO2-Lhi^`v+N<{RAjnrONEjQ&}*o{5|(2bOjn$Hsa~kvhM*yZqn)Z-V4GZi?uj8ns4K zzE!Ge#QKAOsVH{R{NhkW#n3rwEK|Cm~U z>klALeK-wg%6?mV+9tWqWh1#4&I`y!fG`2HGvX*hx2PrpQipX~juey8hI?vCLV&_;N*=T$K8W>n-=h$bJSAQ}<{zt`wC&bW}8*qdv z0vkQu`ws!G^cJG}=JnN=Vy1wN=LCC8vYzZ@^YaF#@rD2a~UkLMKo6F|2_Z zmjof~nVc8cTo+-p08EL8Qd`_2PPWOGbYZvcD>_nC)WD!{x(o?a#@`f&x3rC#%cv^w zF5vsclCIKBy(B?L|6nKcMqhv9XpK>2C`5K)la6KgW)`sWVi!)g82FL~5`>ekR#r8h zoz8WUR&&`|ds=@Za-syFH;Q0PAgnDMYS}2C%ONWkuynkRg~kI8hMguEeCd9)Qlo35 zx%2Y3AHNrQI4O$Y3^Gzv{hj#R9E%^zcJ?z{Bi;Kr+y9|#kh-G@qgpj{YK=%Wb5Seb zbs*T7T?x|_YPAB(LwwJXIk|Z@jm*KuW60dgI#k-WKI~+?BSV4Ge`QNOZ^2Z=enPf%Yp- zA2Rf*6fEe=kr-RXYXr?SbqQ3ey+0)^z?cW`vr!s(yi2&Xr z-^aEtN{uQ3wO$nA+fX=tBu4S+Q*^rx(#Q!_IxJ&%(_%tK*Be|7h;x|-{h zbN;1*m{ot^FX_$?-v6Y^V_0heTB8gIM*Tr*3dfQ`D)$E2I}4t`3RPtf>~6{iW4Cjc zjEgp47a8@wEmZ3I(QD6j*q;(y$SIz(a7Z5Q1Rq^JS?~LZnddiW! 
z6o20s&(ME8D^iCW{@}~cT7vWA3{ziMcYY53DaNR{W=Kr0bOe5af;H9>L)Mp{hEV-Y zI_(N<7CsCXHg*GE%cOdRyHY@EbJN{aeC>gL-<2OghlAI^#R?UnnIEfmoI zMgbwn_iZPW0r+QB;`*ssqtve|5+y)xo{uM!FW-;!sov<}asS*xQiSgN{QV4# z-ipGH$KumqOXS=wCJK%%uJgc#7&hDZ4`Mmxq8>Y}*RS7apT9BAXUl#}E3z~g4E@x( zs9+)&cN*6Njd)4XBW}jMi|?_z^z9SS8*S8(y^BI7(n#D;eduz1Jy(oT%CTIqZYLeM zt7VqU2`#%&1U}#a3QGK`Yn_cSh*S&L?+pCQ{ii1K~ev1>F(gk?E+ z4=6(-lS;boR9<@JR?*HE%LyZE4F%zea- zPDs3B!)ftO=VT&O97z5A_Ya#*JtN6oqSD$gL^b+LgG$&(1M%^IQgFN_yV-tQ%tCi9 z31F8giTHPo{2L1QVhgco_Z+$xXfYd}Xt}2Xv^^UQJMkR1S}opdN@dvTE$QdI#|T){ zS+2OQfk5YO<`ZgC#3_Z)gILouQ|fh@3GN6j&HX-HJ3PKP7aopnZsgi=p0KSH|L^Gl z3wkOiTtNHdvZXH2*FRQT2)h3EA}IaRr*%8wMT~xn-}EC-SI--i0=#$Zg*KFtoh?(w zGui3)66QwXccgk)Q77`e*B~j%*TH*aK8$4WQS%;z6XC=^qRNXr$9Gh$#wK0lQP7Gc zz!v}A@u9`7x2XI9nqbR%2665vykx+R)NJ$p)36uVTVIBxWT>IQ|S*f=y`*XeeZ zcHmd~2_^knP*dw)slt#^ACKdo4=~;OAh=T9HkM+4wfpnlCnvkz+O%Jv!mp|)4`(W4`VOFUm0~ow%|7Drk#0*>9rlm>1{mTHt+1_gq!Yo zuf6WSz_fja8ub0#hRskkz!u({-hX2CiuWUk3!f;HbAJBcVK%F$dR_4&mc4K^)`}?g zk_BPP^N)_cM_gaq1ip{C3o7Hz;87;q+V4Jj$602g8Iyo#DuZ>MjA*9pdI8gmQmm8Y z5!t(*tgTNq@`_m?CJSg!TfQ$Xw*GX$Od^xECurDyjMW(diU!}@ z6P&Q8^ZdS|iZou@_XXG?@jK)15LcU$Irm!DIRTz;-6&XzQr*g^%3f$zWU%_+MLc!n z8WCpKp)Dj4inrg|5A?Qgl+uh{lH&|S!Yw$ zAa1CDiRgxt?))FQ+x+OIJ5Tk5lFnnbTPVi@H|e&FC{O(L_DeG->VgiHbCp_{I+TlM zR25l&U%+zkPrZuE=2yMg+la|?E6B^8zfi8jAl@^TApwd{ZY12XB)a;giQj^j+_Q4y zsD#AoRt9i|NUMrPB<*|Hro_n1JQ_^VP6GPy z`VyJ($a0EG_pryv^41r79_9M3v5B*)RpMp#Lb|&IT-vq1RF_&z?d|58c+#4M(iD6& zC;Iz2+AYMZ4-PhEuPxjfmX+kEN+2)J2R?|%PwmAiVSf!Q(gk($&MQ%tcI9k8`zFuQ zA~Lm_pT{FkAkVjLI{1}~U^X=ytmo!e&dXKbLAfa0k_y*obC}4um9`0?cdWsWDK0+U-~7-YW0y3SDhJ(S<)%@YLrLKqMK^SOw0gfS>!T z&@mYx&PuC|W!nGYB#%1s{NtkSAVO#q27Lpon!J;5+9tth@+8=Y>*-Ea7SR1@8N!OO z>(P{^4ON_mdc?CZ2KcFj+85PiRP$&Rb3< zKF+@_BDfkv91O9x0w@nFBXjn+7^f$M?aTJ{4kLk*CRRV z4uxb*y3O`_8ya^W7KPalDm}~oRw{AYq{d(N;QEhSC-}LDpv{0mvXJZ3o3F$1lb-Gf z0}CCws`zb0` z*{4%8|3H*fZDdQ`vdF@_dT&;hI;%|PjE>v=Dis#|MJe==VF;}$vaccY$E7n@eAS83 z@cKEziJ;`Ox;|Ox%vddtK^o7ljMgTl#*wW9D?b)B0$d~u88Kq}QT&}s| 
z(0WwA*2_Oh%7b{@t67!`PAeH&3lpTjU8Jbw%JQ>r8aBx9QtABp{MkeenfQA7S8A4o zG{YrXk8XiUg6WqJAu+*yp)m#7?U12dmz%L77vb3)@mXunk&Q9?YH2tXJSkcD=M4O> zWMSS@VL`El+2-+_!T z?6QfXzKqLpGE0rFP;z4ru%ugW)&~#9lJEY^eBZDg=aWtJ3ma`~^x<}y+5gAUd55$0 z{tY;ZO;OaYz4soqM-{bNirSm1UAsm?wWz)Mnr)5Rvo=Z9-dojHvqo$}B=7mX|GMJJ zA<4-(&*y&b+py=R)hX2*aVIzw*F@(UrIuoXK@KEC%EdY-_LPR*B3_Tigz!&ENo1-K z7yBM(?FD&W2zuz(MG<-aqsHvy! zn&XVT1&eASSKVXUSPDYRFTI&ydizTVVWZbqNejRHjuYNCZ4e}idCY0$r;j|nRLk?w z;U2K`Fs$_*en3fFsUt;tidzJ9cI?JSk-PQAOcJut8 znPES9bn;Q$_O5UJ^F*53P}(=y5Py8d(P(Yw*s!)loU0e6s_`T<%6R*Y4Irs)Gstj~ z5&fMr;;L;_hWew?Xl`-(^TwYmxE{A1;s04#N6b}(z68%tPJit@rn|phG_v4m3Eq^; z0UPYczA6_X&w%DBy%CSANckc{W|@B*@T>epbmzYh)V7~K*+?XQ=?qIF4JBz03T1o^ zTJ?e0^4cVfoE=+y*fTAxN@{sNxl1B`?}De-ks6=y#R5Ul!#(Tp6(#uYz1ZNuSE+Ug zo-4@W#X8LD7y{=c=~NRNj7Wb<*0%7*Otn%5=42}Y$FOHyPdp_>8iGB*3E40c#;Os9 zju3Y8*f&Z3o}5p-;c!jX4GA3MO5W7m0yOsV0X5X@vndz zjdh&5V}(gQVb>g6R*!viUy?Dm3d-O&L*wv?fG+^d;W&NMeCV-xayEdaY zf97(QKuqikYUqbg^I%ftp)J9Rr9VnHkGw6-aq)4C-a5xgPKmdUd^y~;{*G-SHB=f^ zt!?oaHj5MQJBr_eJJo**t+*{>(c*(RhO~|DUB#l^{f>?D_*O7 z=&HC--m0k9^TZIH*EDT62rgUTDRoHSq`OH=%l5=7Oh^gMfF0ctRv?pfY*vI58O7_t zm&s4_?!)sD1WflvIP>K_(XL$zt~-fOp$kqT^Z1j*%k{&Sg|AbT-o`D*)8}15iO_hR zFDHEk5;7J~9f#b{xfw0vWXRF@3iYn<;%r9TC-`E)qU3^H*P*xt4+EOPWW;V(JP)ty zf5yymTo&Lc1N|Nj%^tHMig7cIuj-PS{yP{UKnSrU{k0duuu@3ySQ_uG^!me_2WTR+ z=gbL_eB(3O0az%WDAl1-AjT$Q_kH-nh*ZCH?~8X}lW?hD*n+ysxMQ?X0aWX1BAcaH zqbaI&#UR3lG~kI7&!H?S%z#HQlNP+RW&?slL=MqHSEUac+pW`|zDD%68mWYIV&U=D zDge!#tSRLk{P_SEI|I}T?w6k>=Fo>hO+-6>(RgAjXUdrxlKVD1SOpyE1k9HU>Zh|C zc$jzWO1=2NygC1SO8F_YP;lc1tI4@0UrLe^4ipT>K^lh>oJom}WvDYav_zOHKtm{> zI9vHAd+c@37AfT5#YcEV*2v#eF=8)ccj`SQg#P2wg$jZUB9uvQ#GQQiIYUyuhBCm} zekU;ECO|#$n}D5sUvb`Ol)ZTVRd2KY3C8DaNk0M~Vs7(@V3?NXjo3U2SAh#F0}%~j zuGi}pjD2+ZYx$&4;e_@i?_rn+O?`c#YBQ(#_&{p6@EH~tjfW;4J>Um5P6Z@zpOXGN z!8d(-kB9zBJZWB_6MSqmAXR$h*MiOMA+yL?dLP$*D<&~zjRcnOk%-P3s<)}j^92uM z2;QKw@au>bI1c}P?h4KH!nTA9j{&R1l?a`l((u`Hm*BR9Eeks-n5bD{vFA)HhakX+o}E zNA6swKkYMR4{(`7)s9{oW3+>Dj|e_RA^mrV3tob2__2)l!#zC|KLGaXS{Nfn(7`%} 
z$V5b^-vkZ;5|5wUG9@zQA`@y?j8fg$HNdMwrprB6POXn)<{|976O>=K0)>zOJ*!KX zr~kQWi+{UrCJRl@2x7vtKvZ%r8ryTFj$23nvR`^xK=J$4^~&en=Dz-P`Zt1byG>+i z&%AxhRKXkS#q}SPZ9?nc<162hcuMZ49`j>m{R)o#esNw{ulBY4+r2MMSwZ$^<_~fo zA9Mjbj>8YWGQY-Mcx=n>=jI*Q!?;fV+y?ldrHW=G%H)2m@7!QGp#&hIkOj4#c66&# z+~Jj=$0hzvTK9E0;5PccH$vUa8St9z9*b-3_cO7%>N_{*MPFDI$u%|?pLgn<2L;%c zej~l}kKDjrNF}(`3QciJvBoA6Tp5bB*~9oL)!Y7vKD*0Y4aoVGgs|!Dve}b^w^{y^ zYT0W)6uSb5K)j%(AEal=`%*~WIVaZ-!DnkDHBK?}ffYa2VBK>{=V%hYwZg|k*i5nJ z+!o4jBMMzO;W*)7AQj~`>QE{}HNHZ%e7%;mRjA%h@ugF(|HCUh-#gRI)j&Y@g^Z>| z<`B!S%Q@`IW8ZI~EP9m9RT@8!0QQg@(i@R=vjw#=8aLL6*YK!I6!!$Q3RBM=AA_E; z&*|vrs1gmhJ=eN-2&2eUxZ?V^x+!-Ik5GnXbzQVDB=!khDx?mQ)jdVdoEkW{%so3C ze_hl%?9E+#w74a_u#4#6Fg34z>D7^bCP=$xmJys&H0PCwE2Ios1%oSbO=amA8+RS^ zA6zU)%G_}xAS8JQmb-7i=w<|u$ne0VMa28y+y^icO0*o_)q+pafd#H!EsmRI-rM{m zsZ!5;2xjufc#CT9u_Al8ff%naSZWo+xW1io;4LBUm=IiGmRRZ-AFAw~b&|`mK+&;G6?gq?QMLJJ-i~1*BHt5wj0N}=b7F{(GoF$)Ka@77) zsSeV)EqDBJ_1p_*g3BQI}xr^i~8rSIR9f!-D*CJpF(kh4D=dC z`H-JmBc&MXcrVp zD}W0w{}s}lM3C3+=G5vFjGj(Z{=JwGM|tI*$&8zwtJ7$&;hd+h0aKYsyJPmicPdrG zKnSuEQ~)U)y&L*r^$~C>5VfjEk{0ej(_p+K|F!Mk>w4?;Jd#Q3p9Kp<=H@X0>p`OG zx>;fYb<%s_Fgf~3He7C^7}3QKbX|jMwVO8+gkWQ5S!-8wSG?CC8Uy?&08n{St~A(R zmu_wso6Y`%gN2NU_g&%}Sz&=-LP1JW2bANB5^c(fJ#ual8dwrydZ_3C2f`~s5D#L` zzu{WP0qZ4}LSKeMKLtSumY06?n=jO(|57A7(a_PVBys=fahX8UFPE>SCI5?`BRi5hMtXeNYKUj=PZiB zE#FO3xKZYI;@$J0YzaKEA7=#Xe~DmnO8ZcK?_W4doe1l=A8Ffx`VU^`%#{f-ho9nN zh}x)j9$yz;i=F?AK@#Veog__peeMXXTvoWqIbsXM zocBvGMw2oUS(pO-Y^h{LF?soqR#lCRnA4d%w)ehK<#ut~_&13Ol>%aBg2l(E7v|@5 zoC0MiQ(~$6V)x=P>t8IuN3h@Da-~e!KK7BjuVQ`Ddrzl##e~eGNcGlYWm4q{MaCz8 z>c@Uq&UYPc38cZae2QdS%nS3rS|dLr75fsjvji{`;fv_wk@OKWlrrVLh3$W9i~ba(msH8jcK0>a3mAMS*yfDf=KL%<;OmWS*1_y|4Z@c~a@33}rnyS8y(0rqJJIN$T+`nWjM^fw&hPBW>}|&hhPX z7ey$GJdNTr*=UzpWu-4-pDEeGa?n>P!TBHN>B(; zL27$C%b){_2hSiL9rP**xEx=b8sOhEv)_B{6lvUgc^7PRihUOylT%&Cxt;*~%*g1| z{7p`0artkhWGE2k#La8)e3leq^t6Q$U(JM0=1G3LPXpbfHtm4CJC>ig``r71sdV@V z-W)F&<6@|GT}}RTEIs%}u&$?TXxHEgH0N4*=)RiGjnlK$_1;lx3&-JIlxnIs!E3xs 
zzwW)k3Bo5{hVSQv29Os6+Pf^c&afw{1McE|&Et>_?K4KuU)%=;TkBmJd<(Kk0T%-} z{e#e}GqLT!k7(#L>MjT)CTGh^(vIq$OWQ{dr%NE34n;f+vF(r%?K9hjh4zTMyFrZ5uVVSsW+&ZCzd$$vr-h~B!)YnSjx`hKdsl`e038+=OAA|v> zVOwBDS=mJ%qK1vb%_rNBXs0O&UYDx#&I78zHI$Me_enJQ$id4K?94uCgBRrX_i zec4}lAb=U4`31R!11G z3;WF6<~VrOjqLhi+6Y5&mf0`Na-;_@dEt;%P+{UV=OS)lP*+M zRCt|m(yOD^;oxI5vo~2QkZ$VUe&WB>&lYms|Ac2s=ob#~M$%lTh#(-Du3Ki;N+u!M z4RI>cU2f5ImtanHz2vW*Tp2OK(+wyz1LQ9_-lpb~p*u0xT2L3twaZ~(CWqHbx{OmZ zb>AXqLHgR^&Ji567=j`8PG~jFqF2g5A=td>TeY74frrQ&yTn-d7w%TQ2x;R>6oCBm z?L}z z6o$!x#_Q>TEAQ2dvk6|HTcR#ev!TyjMWP5 zGxGmdijc38k4`i)F?V2)(E5+tdNyxx5Ga&<`3v~e(N*7uxqCXJT25S)co`Q1R^Z<8 zLYSoy8;Gf9`o&*9^W{^=CB=eoXy{x##?tg97@xLirzSqYEyfx^9@7p@CMFnUeW#;J zz$uL}4=Ob8R$J?Pa$1A5i%&?CTCc|UaRSx=@I;AK<~2S5!=cXkb8Ad6E}Famu7>l- zo6vaRG~d>L4|%i8f=EI*wRJ*9w0=zoZZ%9JFVHx^co=*Fs~fm)m|u9=VMBxHMd&C( zL3sqs5CE{Dyijlu<8XZLACvs{7|nz!1a;$KGNMVZ94|a$z#c3|uE*$MtSMdhPNB-)q&H+Y zM6ExMr(>jhwU=;#{9iIi#5i2hmJBc?DLH#HtiR$DQ3>9i16nVOe(A9fgyOHNPSA20(v;QGP{lv@oq7AyKHJ6tkNsS{W6*VkK-4r*aqrQ~Y@Mpuwwjri zov5~RWr*j@Sfb-f!oejUpPr+Ry*2OiS*kQtrH_X9C9Bsd9X(NoO# zBU^K!deABQVAtgH!`iJzrS&Bl!v7+V?6uVRMQwiYU|QtsYH+?93q>e zJFivpUe6E!3XZBEknQK8O0Vqp2O;`)~I5fKL`AsskWMWc{Yg{O2%L?HJjx`6qB4 z>nBHHDH7cgRj9^YHZnx%()_!b3lgr6lw-{#-IQ`5{ z+Pc|)v>f+eA>wD-higgJn86ad7XB~CV{B{<7xb8CR|)|M3r;tV@JOhMip+2-NF$YX z`z)s!(?)3g*w(Xpi|D?e;e zdUTt67G?V}B4fpBMp+pmlVe+t@4rP0&-k8y{R&D3d`gfouQoB|J#_2ZOID>X!eua$ z(E*_1?Ah&tcsQzdlC5tWTQsFV8)~3}5Kw616E#|=2wB2^Lab0-w6yy=NMQJt3M&y+ z(%O>2Ex>8d?$Cr;4!4Q~BaUhD$r;_ddH>2ls41X&Tj;yF}tZOukoi62xYu>oPF6v2r;Y`wn9i8P>1evH@e<10w%w+D?lAToz!?UImjwa&Oh0PAF6 zjm(;J@f-OgJacron0cq;~`2%@cu%Z(KqG3 zmuu1a37MsHJ|F11DsiSQKBu|9O7-Poxtqj0WocWq1b7rOb1@aCNTtRs^fM-&GVo$T zkmUN?sB{+nfQktp84gGB8kg;3Ld&JXeJWqH(GJR<_~SrHw~=6}@gdsCJ(zE&?kZ-d}fWsK}zl@t8^mVEOTc>#HFEO>96F$|C}mfy~ULpcmKkr=%K+1Iw|lq zg2T499#<2lxHAr#pNq?^~x)(5P5_3s@auQG- z?QASu!0dnlkScfimTB**2sWPk6$fj;pR24c2bvMg5Cm6;H!EGIgm=fJ1fRs-MBOVu z%cDNLw0*8JGiR14E9em`8W?U7RYz+GiUl?w 
z>F|7cv1*z0V(V+K=Nv_mW$#ZAlto4T>OL295>zYp5t_&EHp1cT1wS16xE#Q*OFxA_ z`T{Bp&Uuplt%2t^UBCT6%qUg;T#FF=7eY4e>Pj28X?2yo2-AFyR8`p(fP5MswfcRv zGgqzidv(Xqo>XA|Rok6S<8JJYQb20M+?M+eTe5L|r-Ae~XEUC&zd<`+K&Lzw;LSYQio!04UUmSDcvkC>b`;v|>n zJdf5ta;KtO$}s=j4Y8BdI1Adn@2b= zzD}PP$EWS=|MR;O6?OhVsgb#CG?cc;Lw!BsK#WkB;kEySh}?ckqj!g*&QwxdWfNB0 zb1!SDA(*i!y+=H&Nw+cIxYJbmrWIbLHR~+(z+gB?W4cvk|AAV0i}}vD!lxH`Vl)3V zwz05<F3k;rmaZP1`MPUEu^@2Gm(gZX%Bmk%UsQ6E3=KzIxO_YpSH&n&H8`*yu2U@+ zk74b?Tb}WGof<_k4H&EqXFYIPvT-Lo^infbdK6?HC!kCJ5$59gnY-QZ)2kna+v1~N z+$93@-HfXZ`s2Ngx@jLeMe{Z`KH-q6Y2Qh7f3`)WRa zyDBl9joj5r@Jg2-lCJZ@&sa`^Na0W{MAaa*3CP*nx2Uy|khD>nIWX>g`};i(p6kv? z+`H|5dE)y))2)){&WS2)MC~;(5XR&WW@9;t?DXA?n|d*@l)9fx5k8R*ABEWT#+`hB zU?>K1)+V+}Y0%E&dmjg4%anZ><{Y`@q?_8irE&L+>hr6LDYI~{_nM{y{Kg8M+h_L9 zFWFD=xu$udX+Ns{*Y_$*I3v5^uf18tIF%+x1|cFITwFYt??Q6+yhG(^I~L0L+R|c zS4gbK`|lFa-QA?ONPDnVRhE|@*;$@Lx37_8#J!hpW-QyVlp}9Zw?2FKCyH8)=n_&n z_(%rQ#9!~@_wvqAwdk_2Z6bm`M$~sPXA)iZ3|1Z7Kh&HLC(mD{$GVdHeyS>BY{Zs< zIqR#4>bGnLPh(4CK4~5WzUlEA{`PbERSIVvBueCry#8$|lia(4nC7A0m!hi8Ms!2= zB$Jr6-ur=fmW$1JpA$T$d*mUkdx~aFec}U_M@@t zO&1>cVH6vY*R)4j|FD?vp|0$iwDq&TF8|ldIc^hDU5s|Cx1Y{N3bGvsozqIgWX_%p z=(;CMr4&QpQ$^^&o3fu4q}vv+`2Nx6kdE6c(eE0E@&{6E`P-f)3_p7VWrl(W6 z)B1hVC0{+R+~svYD5eMvoAW=He^_28Ta54C&e7D`FOphennh575^+;^zZQMA{HaU* zlPJ$}XHHF|Av?Zo1nVGAk59SmaY3GfL&Lb0U1Q(c8s4+BQVDyWG9GMkXQ$ADI<2ok z?9|%3Y-Z}pHk;Vin>*Y#&)PO`Aui)go#*WDACeEIAWROL{#)Me9w2V#sr?swQ95gP zxBS?g02Q38nD6vbatFgrS<-t5#%%5ptnM;<4LjQ7IE0ZbnCW}WL70jVM<-;25 z0+VmNhlaUDhfF1f7V0d&1+mV5SYYP%)7zGBFcv@Zfv|xL^+RfxShbmcvyw*3mN_M6 zWhCiFG`0S5po+ycuXZ*LI%}^CQ*~8p87Z49Win!vMvDo87Vf8+ z15q}%Ruc>{3BC}qQK`syFZo+9Lb0Y#MH)|&@fh-K-&~(?Th5%umh?5CtI88$+(UdX z0#`|SgrsWz`?H=1b@2CflBi%#B7QUGVUYNH21QNh;?viZW%b2a<0|Ilz7Ib(0OeF2 z`nBUJU!)_&Tik-zQYa5^;UaDZku--E-gJ7KaA5u{9l^Wv-)E=99F^POLcVn3&p!u8 zC8FETpP$f$S5D{T#-cI z>@tjWmVIZV>s~DaauR!RVX--B@pZ~R+KS!fchV2+o7lLol&39}ZHVl(?x~M?4dhjz zMubA$I2-Pd51GH*rYvhg0?m5%`C7b7sf!X!-@E99ep*{f;nZ>1K3wwa6Et`^;UFCf 
zrh%DZtnbjr0$Hxl*vnUPt`cxN-hs1yFD76jQ4ovG6v2({xGRF)iA=(fDVlW;AZ;~u z`(Mceic=c%$K6Y;?-##L)#}rDE4cDSsWC?`KG(}du-Cmw`<{!S+|St&$`n@{{_UfI zhfq`|>BQZMU@BLvyb@^$wfJjYj+fiJ`nTW<`Kyu4jk=u-%CJqVZfssejTc3%qarez z_PmRi$ap~0mh8qI4gaO#k&CO3SN=LB`SAaJ%qmon&m+6SpN! zX;?7eVK6>craCAang~PHD}F1S(T@r~OU{F$ zoI)@HC(PO3RO?5}oir`{I+;$FpVd13#p(&uzhhr~w4rnUIOheln}B8vgiPVe-0L^* z8KON%n*v+>HEa?|RAEFwGP0y!RPLYADYe7Vs+SYe8n+VHOu0M4YzS;zulF<-4D38w z{6WU$3|%h*O+=#%?R#6Ge09}1f6HNGAAx%ub5sFsAzYb20gQn-doVqJ3LW@Iw{sRq z4^vzxIr&^y;sG9|LmL;T>F|J9e1S&b7 zOHk;gr(l&J281C?G@zNeFw2~W$s1Jd26)er>~=;r{Pzw4ISi+xflYpv!)p8P1J@YZ?k zVla-)K$O*bq;~a151eO};pxq=oNr-UN>zkLg^X}3gF9GWRvzKRMUg_kT_g>{ai#}d z{L^&Z2Wd{2Z>WOY&z2rxn9{F8E`rGQFO~2MKqMy24KEvs%cP&HLFO>}P|Vo57ax|6 zu=`qj+Tvx-?PQ86r!z@slRn5s9&1yGZ~A1RIx@O*?EUf z2*n%zTx}B7h}f5^HaJF`VCHHJ)0A{Nguo&(9jh;4M2ep>i{&WGpBD!oowKY5(;j;2 zot2v09De@x7^^^gl)rbE+b{T{g1!8@{r$Wg@RyO^=u8xzp`K2YX=uwoyI{`ycWYIh zbRRAl;ljR=#ibD#vv?c=7|uvHmvh09^3minw#Wat~sHWYlr zctaXA#RZo#9Jytzu9VsQ18he&h**2Sz3?6fF|7oW z%jCqZenl(&`&eCbj_#gaVXaBiX%vdVSeo1~K9uQwcB29>?@u@!55Z6c;g0^ZzU0oS zO-D-Tl)Q1RnX%~_QP6ugjQYy*a2?xQydi&!nTX)p4!>R;naDS5509g7F}qDPfRS5S zR9grVza6DE3h|&2L;08<+7?FnYt2NNfIrhqhpnHFhsuOZ*ih*_JpUJ%GfjX0oo!{_ z@K|`Zlbs+b8LMZD7eub`^`ge_KIp$Pn^A!nTn|68AYkc#?*hDA8_V4=(vo@r zY^?@f1*V_{e+>SHKKv(fU)7e9(sjYwg@IA(0>w;o zAuk*_01ZIild@Yi-)exme=zzItAM}i|HZx+?K79>EMt{|z*~hvYTs|maNoZT7V0D? 
z$Fgvx*PF8Ec`&6ku8oV27;0zH;#~rIVc_$F{=zGK4-3s75YQ6U41jTmzv=x-MGvA#Z zyc;~^%Z%m&mqo-`R*DW14z?A>A>?goUK_7f9-zyDqT(J)8w5iWG3SVByKH{r<3@Oo z9tjub#&qY2@?#$iL=?{ww`R~0%6&JxYo-`Oc1PSYzaghRNgg!IR-+H3-hz5jz`EYI zX=I*QutZ3&IG_^7)4w&StC!q7_lhvHMb(Apes(7rFHt3yzOcQi&bIxae5nK zjFT;-Uph8gED#qPz0BvEVqTRrJn}@R#*Z1yS0?-0BcQ=iUuZ5(3JH|I&dDF0Y|1nOlj z&qg7CAl;t+<3eh2-D^vb^z@Cq0n!w&K zH;LiDqEq!S{P-9e9uoePl}&mGy`*^#O~(2Zemp0|@12@bJ_f6Yh7Z!i>|A*^{~T#(n*=QzgE|VAC7sIz$dP+ z9S;u}m8iAVm!3|wbV0SJTQ9v7p@WeOT5zupc;x68PU(OPy@{n4sXtzX{TfKq^BxL3 zvEfS6z8kw{dCqVja?^saj~}?hHlJrjLBAm)-ypd(dgXxDmE?UmkhS48*1;pb{V4ejMdG z&88j6l(Kd&s*2&zLklBoVz-#}aILx=v)C--@!5=tfxiu7? z-!B^6npF;JPbjmIR7vo|Yk731OksB2J**t@iexSUWxT%lwuG&OmK^pzEM>aOoC5@8 z4v)+c6(=2jnUS10tx;>@tbi$%5o4g_%pA-NI%BDXl=@HgCu=6OZX7&Yc&C#7CUh%u$F8HYbZJ3~hI}Xc}!d5@J`@0ggkk2T-H0H{4 zXg^yaBrb{;M&dDfln61Wz$ni$G7*g9$iUnL#oKp}2C`t=zea;tL~$(fYKRX5M&(M} zUZH^Vy^f_Nlt}g$yHhWuk5Y7iJ3)OPn6^Vy2SzSzt-j4Gh=$tS8>-c2Q+mM(I1{js*34Hh2?SH*c#>Eo zf5WSDyr6;2u{&$iw}h<_z>83NK|X}a8d5O;IA;%Q(EltrWZp*R#Z|j&42lIHs9Rcl z0rV3hFl(>(f#D~stP#iw?st&ad;dv7Z>Clf%aTj_?S?y{b&`(9);P88|IO;Xp8&>p z9e@s_=~ZjB+3X6|*Y=Ak{^NTVkTO^%2!(h1Q_7Y|7AGEJ5VG_b`~b%^f&d^MsQ(*( z^CJ61Pyk)IHwM1yaCHuTQ<_Y;S9y2J5K0N(9B5=|54lO!)3X&yXrue-KHiNBh zs$6lNJP2AFQBK4nsV>6PsY2#=F|n+ABJY=a^?;G}HPL%d3a$K^x>}gnI+5tYh_aG5 zeyiT(rNjlBAx*5zkQ(&5Irc+qFnPqC0-V~_ZFXb4SV8#&%>|iQ1=rxX(1D}kLA0<& zz_UkP=Us&HI}2fH#nq*p`e_(bWAwZFSCE^upya-4+x}NHkqC}{FU^%-YO!xlZp}|_93!{aw9wR)&6x*d z;u(z8sjBwN$FiSPf|N}hTU>{qE&I5xpeCvH4ZwjgD3wfi?^F(@UG_6aA;5WMCONp-ymqx|A?LpoQtt3Ds1(ZjVY*aau zNcdfcxYLkPmT(z@(U|?I`YA~m?Xz(xxgzdjNYgP6jhG)W!-^$V{dr&#heYzj&#meY zANe4WIJ>x^rHDDP50NinH756L>oiC#zUX*h0Ly|zSfa^d7uGQAN}dEwh^Dp6jdnuD z&wz2nBZzEk4$u;_1%WYWi*(m4Fr<3yg+SLU7JFFs%JugyfM5`}y4wr>WbBhgUVIip zkewWMXYwq^?)BbTB(lT(H$6aIt!))PpMd=3X8K^qifirq_{r1JZ5BT0eFMvASie)E z{E-|q=X*Ik8S~*0*i@a4CYi4+k)}N~c?J=-S!3Y)wjH-Poii59+pLbO~;Sg?SH0OTP`B%W-^=j zAxi&H`FCNot#4j?M+Q@Qqh}c4R}tiil6dyQSgO|H*)X$DPKrY=AIMzw@_!rs55@2% 
zkR6G>M|XEnVtcna0wuF#RF*IAdh^MgR-TVqF`HyogG2D;m?5-4vMG#tBPm+^=HTr+ z4s@;56N0U&(R$>L*V`8kaqqe=)9BqtA`0h=a%IG4L zLRu=hL>rGkA$~gRU4@%Y2~zF^1e{0Kk`!Hq*@n=1*EfK9`hvD9LmrJBmQN5ZvmspMd<~esYp>J%KtAYTH2MQ-=`1enc%(MY zD)jsOf0zl+I-B=m5w4R#>A55RcjJ6Qc!1Y#XKf+OX77&dC9VgW5fE8qh@q4mddRSA z;BdTS!@Jvt;(6NG7$;+7#Wa@qgoA0lLQt+{@lNUOcJ^hF^nHF<7of=Ugp{3Ebf`#3 zYejvbL6xOx{fF;~0|$`<*rPYIDuQF$IY%rfE>>PCJ#8%n$Vf3+n>P{zR8A3x)^{ z(%m{u(M=#|P!n;sttcZ0fnXr*r>d{?%6G=01i=0lO*t)3tLJ%Z&HEIUgS*ID`Beto za1h{&@r|=&8>w&){+Ro?Za=A+q5vEYItFLaeal9&44(wYzHI{5U4Hr0o--knL#pw( zHr5M79yV#y-rwyjtvH;O0#k**Ng)l-L!-?uXZN7AH$da@@X7gosI^53ST({pf>aY4 zh{2Mcz{+Zr#+J5Biw0<}g?^tZ;6=lv?D_#+&gG_E-uxlX}|3tz& z`q#$*DA#@ou@)90(Qx2ty+AMyRvt#FoB3>w0M`^PbiVH#*KT%g5FX>EnIF&9-hvMf zo}1Bl%mCl31Q;;7_wK`?HtZ1oV2F&=Qvo@MWa1_taB|`9P5t<2OcHKOt&Y$G0?@*$ zf7O-tN8%pgz4px7od8{cW$41K?T&k9p9uX@(w-UyKU1BF!yAeLz zE4Sw)w?|nVuKkScT35i9XivF!Pn%p!L(;51$zz?^ms?1G2h)#{JY8ArQ0e?qFFbif zjD2G`?&V<_AzP1ElvHkARw2KutT{UK74!~oQ#B*zCN$Pqf8d|TdrNk*z=d;Xb*57S zGv33=iMfJWkjs9^jlG+q+3Nl{l(_YQC72sDL0wRj>QBpH@(Y6Fc)GMmB&G~SDQrrWR^v-TnCVVDS#QZz9;<@|jjX7>q$+4#(|=eWyG z^`IIQP@#ZiB1j&1Z=%PevX^?7A0!L-6}+6h;?w6zdeJxAHooT0`(1^ps}@{#h;}*8 z+CBA%Lp4zm-iFtZG~EkU&Qq@jN1(ktbq6gJ`kqCweS;s|i((AgP7vG*42N5O9wu24 zm5ES=X)Jrvmd+v7)@1x84U(1J$L103227T8EXmip(qkoeMX z+codOr$ITyevsB_4eWWbjoCY4(*BpsdWI^_ZPR_qE>4misnN_)QHET$gxUDx5-;8s zLLsfVc^nU#P-U9EnZXmN?G-aMLf%|2m`+z zF>jhT{TCe6l?Je%QO5`6U{v-Uj}e zEZ%v-CxhG_iD0_zH~z8d=Wj0D?fFX}K4@umk0t!m0>{-ZHJ?i-n?k&FW-+4yLbd+M}mR4?>VNV^Wf3WrBB37x`Vs8I~<$dG)Da~C4*^{?#ajP zr_GjM(TOC}cxMNHgtC%je$AD_HG}hP-}9t3Wt2if!D{u~znZ~}4xih!KAn~mIf&s4 z|HtR^Z3y>;@H_3pB2K;;^yfXTCY8aL$9Cd67M$FJ%FocO>yg@yYf=+3XMmnXD29Xd15v9A44rv%uQczMl1p$?Aq$cvG zQ;>!s-8Fh+d(ZocFWYsU?QG|;=XXE%-QGV)J5fRVj8kcMu*1I3jjzGD>y>&#;isfK z@`0JFFMNiDg9B7C@*cvb+;IT|3^QKZ+}wMHwgqh}@`ve>W zx7cFwB?VKP#p#W%1joGeS#2_db8+aiI3$l-!Gv;hQP+T`g`Cj0lknT%>*PMkd!KCt zq-i_xRtniu?mkBf&-5L1 z%m~dpuYWq!zw9Nax5unUA9|Ec)7U`Y>Plp7W!K=*mlKrABP>?bgJzsHgbxgMQ>4a> zO#+ER+$%(1|B8@SCxd(yfAa*xR$TdXDBj-tlptP!cad0sI 
zR6o42y{Ytu!->{OHTbDfTYJ6BFY&ZZ1hS2KHPc`&gj22K9->77vj@z@OGlq+crZ^7rUM(%p;4X-)q zsQ!|$28kFFY&{Ptt{6+y)cWuICb*D`s&GugRhkgN`J5a_ZK`pU5aY7AYo}!L@r=|> zLcE@ly`hw26@9k#=2X`|ES~sdyHNl1ufQR^Gny~q5*)dMsTZ%VRo1o@-zIk|6gT@A zBp5D#qI&n@4NE|Vx_fQrcx)!$tw)Hd%gdcg!ktoY392`rD_(#i?AMj`A(`wo>Ih|nDp z@BZZ=SKoQAC2p9o=M?y~xa(2v<9}s-3NMOh+v*zmI;c_Bl@L4FO2Gb0(^@Mk-alG3 zHBQn49d#7Nxs7)mho*cMcAWVGg;}qfaDD<-G*KUa&0g~Tu7i6zV}5YgX=kVJN%YZ1 zB;{@Es&7BrP=6p_;XIxE-yBPNb2zK`y-DL5M0orBHzFGNk z?}bVFv$nhK>R|YbX+Ih*iGPRcr4#Gc7iYJW8`FBRZW%Zx`p_6=uHj&dKgP2WLo})P zITHVI1dm3>zNIVUNsgkLjmkxT8S@>1R?tk8f@YqFN^+gKiy4^ETBeY+NX1Qc9Dko9 z_Z(_o&==Axwe(d@%*L)i}xqGIlr;|A4I&!WY;R}U^4%LLUXoka$3K4-N9 zGLww-@Bl`$9#UUi0g>SwrjeY3SeFl~BiAzQ%#IpURss+i;RA_1;^@C&DXEY)89UNh zV<@>y#L_>yJx<0d;NM~AML*X%jz6dCk*~sq6LvP~ymjP=<%1N2seG!I85tO@8}Z5& z>rlt>)ph+41LMm=>iQ+ti4OxE^uDHN!*(*A%J$@$-;XM1^v`_P9%fSL4*th_)qnKT zt#P~e@{C9xK2SMtI(0%WausK0QL_7N1;03e^Nr-cmRy%c(}z4_`|b}eql272KWW2% z8)-Lzp7b!)h~8m`f&%IT0Zx0`G%7HOF4}oCBxhqfK4}6j7T8-EN4fg)Zw8)r?;|_~ zvs^6NL3L5tVQD2bFNWCbz3~hQ(-Nxgd9Tkf4K4rs%N1g#2_y1GIa}*fo?1<|(MA~G zfme0wCvr3cyOKf!W*+i1VeST=TDRu)e8&8Cid#aoLS~O z!|H{3NY0IVI=*JMR*gJH92_H2`w#|o(VDW#VR&M=T2}<}fR4&#Cb~mt*1AeJ#*@dw z*|Sp>L)bkcycLU`cj3;#<|SVbOpMsP9*ZAC_n8|nOh&{Awdp_(6<)){Drt&!?bD?* zziK?c@%zncV*gXwHoV;MOm$AUNuC-vPl^kL=XE%+Q`|nNB_-RY%bHJ1Kg%3)zxcV1 zW0B^1gL?hl_QMiPXD2@mX=xCd^Z4}*{>`?;3FDj(tZTerf~Ts6@$Ow7Ql$RZ#TC;L zhG(SQboOv)O6um#C)nluZ!^N#-b%2W#ZnO?>Q$Jm;D2(#mQsnw53Vo=^@~EZPdU<}JZVU$yF!{1U>vmSLwM`sz^*qh31=+k(WXq{lM85G?yI3@7!y@-id(^{Nn> zmUMbP<5mV%p?y467e-sQlvX#=EPD|c9?vHJ(kr`Yv;7mVG&~6*#6)%LfTwHE+HfbR z9x;zMx+_Ol|Dh}9qzsyi=L+nu@!?MwwgGBNn~FR#V&Tru%Z_}@ds4|?9)ffb)ZX~_ z(=-U|@}fOfVgceGPD6wM1bXw;z00D4T|5tdB`;jTg=XAsrwgzgOP@ zjFnsYC6CTkoN5!elsJ>C3SxMp@LHp0=s-K6w$3XZ*TS#t?5a3={+;k(3gn&wllcgn z*?FOxqE{!0a`Hzi$~_{*dps^KC+Vh=|52DxRM|#y<_k?n|5rPm0NjxR+dPA9Cff%& zC|ybkj?m`UqcpcALw}7A!A=@bVl}%i93f|aDZjC$2S`}pLw;av^GCY7v!g4^vQT}6>m7wuR zBz!giEIiYUmi6+&f3DLf%@T@4v!U`9!LO_~*Wkp0N) 
z!GNHOen)j&3SixTSDU>UfQ7iB-@CW(da&!lS*lBJXo4Ik9Qm0-HRkdheO|F^f6MwJ z-v3d$Fwaxsc5$`&vr-QIcU0*x!hTg|_2VC;bHYy1`AdB27qX#$*mutDS3i3E9Wr_1 zb4V0^bNv_M^8!8sNs8qV*Ljk)DLO_(Q^-Y zq#sXDcXa0SV^L(kq01Lp9$Xbw@Rj}|QJVL&;6%-dz0Y*9`NwU#*kBKmj{i2Ulc%X( zCklHs3?p$&K~9KbpiBog<^Mnqci6~zXO1H?@z+dv{Lt>mK4Uz{e9J$r2Ss=?#vDga z_?gggX_0|nt2fjXx&ld}m0pTB>prIraNKV^eu3#}iVo2mI3141W7i3+X!mKzFSQ!rkJ z1)-j^(IidH-^^NJ$3H&!L10XA@`cSSKxP)9@}TEoHvT-JQrc}&`(?1yfX36^7j<1k zNlV?tBQ!GdnksH`RWD|PIPbPX!||=4Ksc!VQmkp?rk<;#QnT0f*m0Hkq|j#yj8bmj9q~Jx!OTg@hRq;xym}__&GR&~fpt0=;&J$XIhWL&?s8_(HbNzD|OyWsNbN;62 z^tRAAZ9MRn>1$b@I791`rQ3dl`6kS*>(oVNSTSem@dp^qWm^Lw2RAM#xF`r^UU_ti zAM|8yfzon&oXf+JqcFSc!S3-2PIR^E6vow1E$8Pv*piiPT@H(Y7VL~a)1%6v=O*s2$hadg~J?dTb*OZ$e%om@%8 z9+<(x9O~Xbh%Bq%@UyN|buw1zW`p`!E~-e-`;5%LiC>=I6!_IrirojEmO+zL`BV5) zf?wVi@z1-~Kqe<0C%pqU8q3Uc^uXMq=xX?}#?P)WNfo4e>})_1?Qv^vu+Te(VP$9)O2xt;UhU1cW$&^)4|Q3Y z<}Fg*WG`yr_JC*BM^1WKPYefUJ@lKo@;T-~WYE5)xw$KI@3AJZI>jtw3JqXi?R<0- zTA7!rPd?2EI;%X#HD1?#R64%*#Nmm`K2G5UZu&p+5p}AAx&ah%5U~P50oYG759!=7 zP5sq#W$Iy>^};sgAw)T4rdq`}r-a33|vrC6DzuVILreS~~jmMD#6N759BJf0$qFl@(?xImmW;38@&ujpjIBM|IQmef86x6Gi=kLD zlT_MM$7%^tUpA}&46|$-M8bs7Dbm==|L5)tuiJ;55Cq|mun>K#qwK)XK|Jt|#u$I) z_7n?(fjgQubY^eNCH<`<8IP);Yo&18&!M{%S=k6;j|R@uP&d=i0x6bQI9lMT-9ek;U6AtqxF3Sb1vdNYlEGR zQm&xO1k{C{!-4y^b5?1tVxR!n(G`=X!m*zp zij3~j{a+rn!EXCID1G0C7@ab+Su8+pNsipr3piz|=+m1C!PW*P?C~~u87uny$ zUM|a&Q~Ufsy0N58q*%m%vVo~bCpR_KsLi--sFL!-VMjBri1Oe=7i@MB*xR^Or{+B9 z&T#8azoEE<+xwk$5UQX(;R8fjh;!B_lRNl0ze_I=EZ?cL}(s?_q!Ti?l6@kTxg1JyR4>dyqb=^p^3XBKG;vL1shWAi;jvBCl8~ z*qpw~m}?8|m5(PG9yy@kbZ3@$)}r@iki=b@$)R60VezI#&o?5eljQxowvKW5P^W$2 z@%CEdhoCuW@=%34ol+$GulUQGlOt`(`2OEl!&?+g6$&eZ;#fwTaV*d_@>9EA{FhJm zRPEh547rUR;t07W=WIO4;~EeiMw>@<_V2E(|5f2TGq(9x8sK|NxTGkfHh~HkKOWQG zfW3(7pI?>Zg$C;RVm}boNZlOQnZY`E*A0URT2*VcZws|x~letI*vaame8`_^=B%qkZiaiCG*cD(Arjc5pXG?5Y;U-NO<7Qm|xqIWL}_qs-&k zx#ownE^ut^Q=e=;lCFeEAcCt=N9AY(`%j_RT9VlCzP1lw~y{fm(XDu=akvji+CBqFwJ>DoI*B9)t_C0{O3|FXc(rbmUs)- z-g41&Rqp~HYZe1lA|k^4bpI2N1NWX#srQ{ILR 
z&)J`K%M-DN&?kYaM8$r|56anUf2b@pb&`DssDog7ervqN5S-e^PRTVavK=*EKepr3 z7?_PpE=*d*F^W{PeOBLRrCMi5s`Amjhh9r}h&P9XcKX^xBbDC2;+PUj10t>gx7$;b z+l8DNzi0(Jo(C|tU0FYVhf_QX54`=M59(f)Jmx*a6&FUFA;}n27``cIz@OTj65R+> z1^pVc{&HdBoPwR2l;Y&=-sV?xjJifn5=H!r8zN;g*t<;xWdgGmCde&oJ@Kvt)9V5- zso%N;)(<5-4lhS6{&tbYw}1kz1kw{W_0SGz0nPaxi19wY=e<_17}m&O z_Ck2e($MXbMD1nGXI0hp@W@;sbC9kIN0$@A5d-C^y4!a7P1GoyZZe!|&`) z-;l7hvi7Iz&@94@DP{MTjOOtU$?UI82Zz5YKt=Jp+d$74j5r9i^sw2(RDs>iFv62% z%#w{Q3tSDBAyN|8qLj8vxe&Hrfc_j&d6dEgj0we=-i9e1{8X??#*VI_+G9nv6JDfL z$vXi!Z(GSt|B+X+C8yLva`3wtQ8t(NvG5;2-G=Aw06QpPN1m;>?xf%9Ha4p2jKUfqupK8ca2C!D}ZdOyE+JS@(BL{BDr_pdLVQWl3-*sXn%9-+a$sDL9a=$;J4 z`r%o2s~?ai{f&Q4aShNiMLIt~dBD-O#LZizx)RE8ozVvc+hk^-6Cs)KTmVF1Zf|^6 z*#NW4T+^}Xl1Z)8sb%>fv8yGCIflC z)BYrgCPBw?oDwVphe5g_vqS|p@2$}8W)-Po+ph?Q07OX%U{#>${ z`gM&C#|X$Ubz!d=YzP2LtlpXO63-w-lH_^30_h2f^gkVfh+4^(cs*~vVWxedbF`8Q zH9CZ!@uY7Ezf8_2I`Alc_S?6*x>cP*05`R%yg5M$h_~Iw15L|kKids`@LCCgkQQ~g z$Mii87Y$Je4cU*abezrxt`MGpSHeGadqDkSP4edKZ6-MiV6>pC#w*rrS6q?LNzm$_ z1(R`p7zy2_>%#agX?A(#G3u8dS!WGJ(o3d=iwNaNYdDHIW0WsCjm*vBYI#_MWrOPN zDaQ%)26>t-tS!syJrGd?bl=>M^!?d|@z)oQ#zD@5nT(Ihw`C5GH(GxDCtQYo{4j*S zA7Gp4TQ%p9q#fl^pYy`uK7o6U?NdSbTfrFhxf6q5!M>(*!~tH9*&LbyYXFth*@I3( zw^JXSMRMPw=iTmPXzLTn#=!ur_iu3nw4Qn1*>3pV_i$86 zVkj(SQ8cAi`q1*Ll1u&CvX7l+$&-L(XlSPEL7O*h5~fYC$|ioRY?jjW{Rd?v?3J?bvLJPeRf4+hc zw8j0p4Uoeaf<*vXOe{iJz-DnVeT(V`%TrGL@InsMGRKJ}Vk<8Fp7otCPVD5_u2gh!82;n%%qDAGL-Dy=^4YSGT~z?CV<23-8Bso0&)$g>8QaeZ*h?wut> z!x7WXe!_J(ZPip=)-AUB;DmJC3m+s1qn}fK{A^@356cMXv6R7pc!cKp4|b-KAzD_! 
z-M?`CON^v9QxM3aJkyPd6(U`#3wn5PyF|EBfo0^yAqmH7Rp)7FXvBU6emr-3;3Re* zy9|xsqIZop?B*c-3D!1YcMIGIBmn?0XzTcqp4;hJE0I$iqv>qj^|M`NivL@ZFzh!- zNHT}J{HY#m44hP#H|%fD9+5Xy>vOKrFW1#Z5K+h-0uPXq^&Q5vk6l`VSwStWkLtO9 zH{TQUef&wv3BoXn$;bBuU&V8ot)XVLQmDR7w26jRYD;|t&ECgch`r$(>X}LwUh=%< zfk>NX06Yil>sfYD#7`H9Cqy7M)bhFSSmRZ&)0m$2p-@^ToC4LBcptH`%;H25 z`(Mb7c*F6BscqTI+HXub{lHaQcvZk~yQo;}{xkYWF!m3@u@{?)a&-Sq!SDKt|7dIV zi|R;Xf6WNTRe`qehgue0CTT;GQ%KUS+c7L@NaEG+*AXINF23Z3@dmkvHQwO)412ih zDm4tpoTgmo|0@!+udRL5z<84U54{|Aa5#7nkv)p<>HF(rmH+iNt#|&FOjaQ6OJDeu zvV=h3TR8n1+`H1rpN6m`r1qvh{DEwl2C#&qdT?mdSx?ufsaT&Mb`da4)aF1X>Zl(6Z< zefg_@gaGPReO>U4ZzA2=!0vU=C`E*iMlFDtMBuXmnvoRJ;PdOKhv#$}Adw%P;8lh9 zHKZdGCewJ&5?wj_7pqGrSa0tW0>6u}VOIhUsHpMu@BPt9*2_172&n?#&9cZA8q|&c z0M2LUeUb~(lL%-XAyQtaEc^3h7e&YE8R$NRqxEJ38dii^|Au6&=O$=i?Dp0sM;K$E zj7>;ZpTZV;l{LmEo)e$_8VPv>Rltk*a&hT!>?5GZ%*9_nTwFzj-~*g>bl32ihB;`e zJ?CyU;r#gtzla95{SX2#iug3_5Av+COj*1bmW#7I!xBC~*llm#DJao9%a)mN!`mZ? zn?2H>UgQ$GkVrZ*`>?hikbLE~NxYf=?jAxKwbmTN0|lz11(Fj7w!2Ha4EU?W3hKRV zHJ}m%S-7w)rF#y4JzU}3$cv`i@4vBolASLI^y&^BwK6=M>vxAJj_umSco_4WezQL! 
z*A;WM7&yk9JH{LJAq(w3w}xZ+ova8pHomzFBxutXPDOd9KPDnpQ|#;T_&DK~BGh?- zPx&n=+{rPp^Y;OBmcup8hjq*3yKMZvFH_HON^5iG8?OH}d8kwn7CnzN4PxQ7pu9ED zcS?K9;Q>NUH~IODSO3QDyR?_!b`^F)c>X@L-lA9rE#P@))V<8b@tR7p3^2VY>~k+9 z;wOHwDT(5{cMW9HGR_==k6$H<>~ytJXwU9SKGiQJ4^a~76}#Rs?=I}b?9{s_gJmJ?(+*Sa-jGT=d_wGJ zAc5(;yy>66wCmMOA}4I%JL+JtP!xCAo}^9VdhK)P4;g^0aeP{9-c?o)p|}tdl-%-j zm-+A6m?<$W(8%MlzP{|;nJcpE;n|c}ax|cre=IvzA)hu2aMhhpqx0?ZQIrjbo-fix z|LV>8sF6%p9X-(W5G`O5EiAifxk-w$rWq#quC&tYJiB3{;isEJai?$_pfY)}xqsV6 zQB1<>l(--ul1IBtCSpcHobOc1MJM9uesxJ0DM9tw(vOPy%-% zf$0jR(Ta%QAAYf)sq3HNy3;2s{Yijl|(@m)A`KMB;9y$BVdzaX4`IJ^qG*u=`qy^YNDI(qi3 zU^ptOD9uFL@@2+<59qX&xKp3j3%$}{tF(M07>5;m;?d1k$Lh-Rk&eU41@nrVnB>)M zSsfH>YdY7+W$)>l!X;FHv7FkHp)6dV%aJTd6v=)saM4#U`<%^{Q|{kOt8r;lhEj*T z`(#19u=@d}ln_vQe9R8*eStOA+?`Up30<}p_m(#q46k%%U2}uj>#O91-U|^*6FwY) zJ>^VwS8y4lQaSEnmXXKr^`zSjk&@Xrjmn8%(c+>wecHn0`qNNntE{E;WOF#dgn!gb zLLq$4w~8O-Mc;huEBO$9r@LMA=D?AT&yCBK#Q2H$%VoA2%?sW$(Qg|P>#XF<5e$Aa z2|bG;DERbtXS#3(`I_*8#B+lO#Zlt$dR-o+FAw%~B{hVkw1M5s#`)sr88ZIb*#5u5 zM_jKJ+qiVDVp54DBwR!W!8&Q;8GluJVuHIyqI~HM9c~o{o~GoP8phqzv8i8mZBXT0 zP4JAo1cozldnf z4|{_W`dMjy6pPy*yX1;_(Fvr`hfDWO)Dp|YCXcD+Me#D+GTcyQf1SR+@`};w>&qx{ zrlLOu$xm`t52f&p1XAtHGuI7iPvVwtuG)sHtj}Zi?3-v(FloK+6(7%E;n453pe9S-&aPb>VCX_ivO=d&q&rOV!7^xwv7a0ihz3U zzj|!tx9u9SpJ)Gt5^Y+t|M0sC#!BDi{$bvmdLMB7vXnyi>%-oPdzw9DR1XJuU>$s# zj}F`VL)6T#--NA*Yxd~~By4E}2gerEu*@xekG_cy(-HdKBg$qBvDB>XR zt?pfdOS`-nR#Qo(j!#mxn)}>DFE;mibEA&WNyTyOl?N9)Gn&FbBKCd{cOMmfS!PrM z^HzGaEI1b&vdpii6$E8&8%y zQXvhC)rD<$;^cilo)XV55U}54Nv`W{{w3XtXVKK~9N*!J^-+Y_q}b;!?W~cI{5$6M z>5DSCT(#6>#)Yg!<5>&RC%w&OZ&%Zk$5MmiLEc~M=WRxf?6cIol8uNLRYtGgJ-Snp zH;{?cc>g1>WnKbaK%&bt)HsInY@?6KdLM&6w|Q*@4e`6jFq{ut_WtSS&OhhGpv$LJ z5b&#)c0Ep7lrTgohq7>l*P!o{Yj|Fdr;I}cH&LMVO8JD%QyOq6X0RFLRWH~-r95aim8q-1dG=T+emJl!?~QI&w~**er=lW z$M-yLswR*5KJ@AH>EVl8a~>aWlrvKGcJ<}xv5%C`{u{L748+Q*v(7$Vbg~zDo^2=j zkjAIBr37}_HWZu?@+CcKedN*CbLr+P@(xvnEK3_9FEJsjXJ=cZZ*C7!$6)gVV!a!Z zg`kwp{4&dGqwj6q)(rP9Bbx8M(E~srJzFovJM(iawYIAHcEP6RL1D76#V5ymhWGdV 
zLk^vx&LAOl_MasWsNhf9lD^lBOe4BKz_s!xu7vj0%1Ce4yaoa{M0N+^8@kd|zTX6s zFe6&YFxD)50=O>0Oe32B4Kg@i=Q;E5CQb1?$O9)S^3-_gD?vM=pL)=HUnN0$94EVt?<6@nQsorzWP?t3FQnwK7Ya4vo;Xa7%CN{U}m%?m56tv1ayEKENH+mDlujfl4woSd2c;E8p{vppyM9>94)e{~jZ$ z)5AgCuMF6Rc#?g)#iKra<0|{bIx#-;g!IokskfECiaXtQ^OUN`pGLRx1k7@;#oE-_5G;ySOt~$?-N=?s6E;A!GjCp#zk&ezGpb6yRvQ_1KL`&&KrV z&C?4ns=unewJQ!A)9)_7$&2gkVZC^2jyp&WR?PNm{NtuAE0=-9xnmmsa_oY;Z zVnJ35TuuqQec%9&;9w?)Tlb_OrKCvS=1D#$zjdN327J!Wf{po{j;c%`?%e74AK}@; zROaku*8RkRy9Qu>?e^qOr?sdn>2K{bB@YM2Y`Jlq3>>{+djCg)(SPr?dZA#PL(*IV z4AO9fA(pB5^d}@^qrd70d0s&TzQUdyES~9pRZjK?LD$c!ydG>nci8I0Y)xhw&8fx& z8rWSf6l{&1KVqu|J=C4@A8v^Yg)VWO6v*Rr7I#wX>m1TqO|)Ni8%3J;u7ve`G&#Jt z0u`u_YSxXPGQ#r~TNF594#EqYKQyXUQMa6*^gB4;?8HnWN9B_07@nC^sQJWD z+vFlrlH7AOMjluBD0wZH@WR>p%Y&TRSJ?Sk$OBan{1qO%)nS za836#ARv9GPEvbVSmbR8L6|QfaA9y(zw{sBD@fOiZFt7{_rhv7b($_Q0~W{FP#8K9 zYH8>B+Yfd&x!X8sknImp<-eX+GNp@ULKVd*+Pm^+uqzzNyR?@XrUef%(c`~5qRx5r0dGzjHE(%7V2Obt;wIEl()8xQ`oHSm;`PWZ?jELoMEfD82 zP8glsBWcn%7p@HZqm!Y*H5j(Ry9+TLBqyr;r5q3o14kI&%@4^B1q2;#C%WDqU~~c^ zbpFzp+Wm?$j>HMF^ot@0#n^r%>+8Hhf0Ojh+FRU`o+$4mVK&kmF7f}HE%}4W>s0E5 zu&KWQ8)4}bAaF)u1nfOQQ+`o2I=|7{leK(Y60;%90>u$1?3u;9&Xh9}kjSE9(bVj2 zLD>5qH)A6{&a!+!+i{fYq*go1BaxB)ofv!D zdU7$7Dx0MJxzSrT6DqyUx(D4NYdZ(j;Xrk0(WRZMpQiiwv$A%(J8RYf+$CK>Z-H0>f%PB+mW)j)Kwggv z!Q~=D9lCH%2OY@J2pSH*FI83+49AD7D9t~-^xE2j|At}UEqM7xVUi_g>I);ue~FrF zAz=`!ml4YT`Q1E2i|Z2P>>6=0k^KYvUn?aJv0>l>v7XL>s_Bx*ajtz%pLOKL8mmmC zDRKSSqs$G-U#tmaeioV?f6?e+_vHorYT}*H-i={_S;p=?%V;7SnxOw^_kjX(6Ba`- zYpYCXby3g_kR=0eZ#Ej1LY+LG6h<0PD=~F$Tc;?D3p~;`=J|SIx+S+UOIY2o0$Dko zy@aE5%R<(a#=!dUZObLfBH|ISlIpOttg$h7K_n{Nn(DkW7t44;Jh0P1QIS+c zUyiba+cV_qMkaDr0moyC;0Y1>i|G&y6TgMApLTh_Xf9N>dEXvs4HbpAt-PJ6V|S{7 z7{eu0x+SY++9)utHCL-jiwN_G=;8jtHLGv*fv}Mjn9}9hxIE%-nh4Z62um-uTTnR_ zk??*-RaC;kWLxe;Y0J*66el#xA^PLO_*QcX6IRe3_{6Md97tCXpD!9TxEO3+JpkWBruKW8jg;~p7TN6n~F{bjo07f1($U?;+n~NN` zdv}^-_-pw*De`F!Mdr5BXaG;GCo1Lk_zu2pUCcSRDze96L&SA{Xh6pay<=gQ;SHv&~ZQQ(Y7 ze4Cp`7Uk6c`o_Jdo@(KrF`BI1u|0z&*{jA5w4^P^RLl?>mBU#HGI~oVS 
zA9LOSx}(Ci-&0GPUSDN+AwM)HkU+kQKlj4n|Kc;^YrHU^Q&J9>XjJ{}a*V&gT{gBv zg|Q|mBYrA5)Fi{$nWW#*u|RdDJO%_ZEj<5C)tTWdK@v0zy9)&z%w~LAOk4sxNvuDC z)aAIly(rROCn+%tce4gsT5;l{915OGlL)bp6G?z{{eth*BiaTT!2WVsFtwwdEDl_I zbB)|*B(Ke;iAt5>lE2s*8KQ;^-t2$3LdBMZ|Ic6x3eYCYii0SqP_)rJb+f)!bLYeA z6{Xi}*^C!^sc~_i!j;nhqNO>^?Aviib>o?)ANTCVN<=E+cI(C$Z5+A}B#_*t>!;h4 zvRNK~YvZaune)Qae~^?~WOpCSkxNS*C?DA8iri*H|00zqZw~LO$`0}nhiOHeMyAP? z&$b6d*pw!3V)3XaNNg5);ga~Pb*U0uEW+~Xh5u(bZhhP z4Yswtu$9*D!O_vH6Ye1X-mLSk;iwJB33nn*OK_LkW5=Yp^r!G6->|i76y@2bCEmQ< zkHbsi&Dl6(2f5J98t)D)3Uz!uXzZRJ1rrFq+fBjA=u^ZmjKjX4ET9wC?F;wsqUyu3 zL2b$>Tt*+{tvo3);rkrG-%qmL&HwVx*9VXLk#}g0scR$(D}8yYxE(uzLmwkQ*dYBP z4y~_QsTYoUk*84A$NoC}d9+jP|%vlrv<3sJbk)m0hZy5<5PnM71bSNK|U4Yj~sa!tE*-(GMLr8rYZFGA@>Jr{7>2?=2o;ml{-1bvQA&wb$x@%?h)z;Q} z5)6PXO7tc|!wrGdS2N*i1;_^kOD^aYVyVxAUIW-7MLo`f9m9-hHF}OhzK#~u6$(Rd z8wR~Ymzv7jA7PostdsF&lZj)~&h+_m6P`FdzS4`pem^(w6QY^!(uQSd<694r@!p1) zz?5&gwW$l^Hu)^$BxxEJhx)RSJ>rsJon|!OrOecy5G~O4Dn?z5_ zvZPI1Wd-_|L?_-n?5$k3{}jMzB5(H5s8#NY*8E7}(^~c3r<;eKu&mZYz=5^2>9WVL zIt-6&y4+ponQ%Q~)xXA&z;JvlFaGT<@oy1HP#A}Ah0PkAi>6@HQn?daSt6okMPnm0 zqOYF_Q6KrHI&W^A7dpLTNwv4GW7&@d_&mFUEv%L`&wC^Pu6@KfyJ|l#@%SWX&k*&^ zcIi2rV}^Snh*VI{I&iiB6gG>JW^Pu?OMjyLBu6T&7BLDA6kOo2Qxqomw41P+f>2F zUU`BRPo#r>3Sz>}Z(<|)>F*1sep#|D9L_8mxBiM{eu$%=-)tHZloimSO~#g+I$X|k zpMTySS3KVB&L^k_$q>m|e7?!^uz?Fplg19N$^!8pwV*jemFNZMeCRtb%KcOyFBnCu z>e7)9y|$a~y`NO^vxG+TRcc}JPjjdMOJ-h82!02F5=#J++xH9r>*BU&nT|_vnUwBV}MG< z4e_|TA@JAN{p14Z3fM39Z#FtkK0II8ijJh#O#+P&DTMyiRj?sw1zQk~iTCgRSwdIV zc7a!z!?8L50YTthVA#%R&btKDkTS>v@5-M49oKL;&k0 zaOQmJqp`c((7cmi0M4bSn1Yue99QM)9ue$94;!9(UG6BssR15=!ooRCJzB4FKQ)vT zfOtWLp_{YNvQU-TD#MVEBWMZ}g+!n*(SUd@ZT8&TTX8LJ+I*G!I$^0w0`Wuc}AM?8P(>1E#d1WO^;Z9|U?IyCoAA?R=#97*WbR`ZN6&ZYTgCiS< z!EdMDSRCD1gZ<3PDGtw_@d7T$JP5N~tDu>9gXr)Yw#WEO!0=86-tZ1-^JCtkWJprA zr2Xv+{8l^+1fxb!p_LiG*!NcZ5wX%YvQ`ul>ee>XfR01FH7y&)t2)&y-YKjzPxp<4Sz|6_%^i=xQv1 z()rK{k*yaDR1KDhYIc7a+kW|66Sv)|@Cpze9i8)M8wWgHAa&5~e~@4)u0Xy7#%A{` ze)okRXRz1PMz&co-Y4{=3b^>N)UETvi|Ra-WCv=+GN5c6SxD3_H%NZYpHMk-x51q2 
z$)te41d7-~(MPm0-%vcSZ%@BjK_x+Ch#$aG7P^OoRz$p`4`c6QPjQaG;MREc=R`Mm z(eWoK3zMd~|CG1k?EuOm(4?u1lF^LMOg5U_c&4_+|AKW_A0u(`4 zcj5WpxqFOo>Z_WCg6i_*E z{&V%1Uby)ADx1k)b<(`^y)95t(_iISj#U~|K8MXl##@Ne ziP^X{NCcz|#T8xdWHS5neGS}S{I<`cM&S~)#rklju?z5=1jg>brOM`scrSnJOx+B@(Q@&$7nO#90-O2oipV(x`m+-MI z2xvfDAbC5b_EakB8*=vsO^ilMq_ih2DH^Z-HuTrMg_8w+Yl{NO0UMVoUzCu?gPs#+!vEJ*yV_-8Yta)gZ8uJ{$VAJE8L!s!Vmc z6q6oi_2Q|MY%ao05k{29?Y%>JkTNcNXT~*Q+C7robffIwy#8+Ph+`64JQ6_xlHl6n zY3T9T%O{8cv4Uh6F|>R;N|yXui)eat`bEk645+{u6o+&-v$wr2o3|AKLkqdzB{R^; zqg*Q8gEGplR_YOFrKis|fBcC?xEdg6ARMWuQFKEMAN3BwUI!Q&|H3>_hJ&T%2*(ih zBOdfx*Fy711aLG8oHS8M~9D7&s@?#;RXKAq~TTwRFR zPk+#Y84wv{#6_gNcVxLmLOs2EN$Yw4_UT(*zgiDj4>0XuYbC>|?@x&ie`A-crOf&w z)A8TnXA0aBo+t3#ww0nmm~3kE5nMb#%J9ng_gEj^uCX+N~~jX{Y2)NhsN{^9iBt^ zG#Qh+rk|JI`V`6IELWTHTnY}dL7Cb4K$g+P<2PB{Br+OjIMNYP-!i{0R`YDut3L*~ zL=$=%;OV^T66^k&)xQ)VwHX3hr9A;%Gn~AC59VHeVw#yZAa3S;WIIabeg7JKjovp3hJj5QyUybt)^a2LN+Rjo=3gNEf?E<9=sbJ_z|1>}y!Z`+N_>go= z{8gOQE7W`YoEOWZFn4X4$SlV>QTXM?O&-K%l`dCP=%h^) zo92hFac@6eTu4FlZIb>zCo>22`TO#mxc2t+taYst`d~i(agl|-lA_ixNoPAl`k; zXtvk4vDbK}_UrEsAunLqrF6R_xa2@rvS|5)E6Yk8ZGDy8HO(^Xzc2uK8l;ZS6##^dB z+md$1;H&C$y0(aSK2$>yy7mBdO`?Ay4#qe{8F5m4B2QV&c%3)RU91cs=y{Rnv(0zMWc z)xe}c$xYkH4J0{FXVnjgs%)GOW>ufYr4yDykbyLLI2wx~L2oVYP16=L-?IqOIo*GY zdq;&^o!>!P^RgA0`GcRV(ZZ3o>*ZBY?CXNJP%yH=xhK(li)rz2`*V~!4DJ9JV)pR5 zojf9H^O5Ub8Ku z@V;G@!VH{u`e@dF$h@3yJ)f08EaxW^u2J(>y2OJ*fIrlsRHDnZE^EPOc&5DNfMu&I zj{he)NLyCUz(M-Un4@x+iX7ju=jIMNIzBq^iCS&SozNk=;H&(lNEN#3%-qkR?q`$T*{eB6IMNRZos^!QFcE43Y3C(g?0bM1jak48Jg zetil-8v>y;j9sjYNs^v#|K;P_JS*`}e#BcUE5_jg@v4n0@UmB$oXss> zGe{73_(`Ox>^J{+8q{X={Jl_)xUm0k6HQ~+M8kKg>DMI05$YmUx~bx?J!c5JvSVuF zS8yN(-HL0~T~DHL%3lIQy{y2Z;?OU>g5>*TD5=pJg^!X>m&a2;qw@jZaX_`fbqJ9f z3Dc)10*Dl$!zX^MA*%}R?B5a3H-o) z&!n25_BN8<%yb=vPIE?w;en7hR>$4?E z#Ks06)WUt~Bb;swLZXpTe^joO5Qcp6beu*1Esvm%`}!;1=F$(m%5N*n0n9;%b%J7hxvGU%PiVPw zh(C`xue#^+Ic!_NSE3fC@ZLX(vT3jvEVZdT)JtU8g@=;T0-qyfwqzMIHZs>fUeE04 z3R-q+N$;7?>qO7j6WjeXM>J1fnv3bZ`)g+@?CGQcOz_MQQiHjW?1!CB 
zcupdbTxyeTqP~YLyJUjftxze85y6v5+tKW9kgy54$)m(*@kA;Y1QFc*H$#riqk5>g1+E*Uch@vj;1)S5vBlP3(7-v=fVKZ+RUkgZjk z5R8+)iGq{~{$_l}I?lr-gt~Cz_hMx@v#h}|-#zilZb6fgNy-N}4rUh3#HE$& z6L-9Ue0$Epmj-=II1;Ch!FOeA36F_<>Y0)b^2T4jYtWEwxEqNXy&3%Oby4&Qzdz%I zS@69IXco%VlC$gM$p0W;O0CVxEB38;o2178o5Q_~kOi}wGzjU)qeH=g$&8jMll`|o zYMJec;~elH>L{XBmH9=Xq9}a~$AWNHW7AQxz*PJ^qZF+5UZS_0+rt3edO>6-*%wa8 zc!}sY=YHF)CY_V0Co$as=W&;mC7{_Wyo^E&!!P+M5sepv!f{FUJWNrKw<@<9j~Csx z^xkQVTr&_}cBoTL$Mb#?_YyE52>W!aSosY}NIS^YXnsP>Zb+S2)_P5pm_WbBj;Hi2 z!QC!9A>M*um76*orQ(R$|udYzyBJHA;{uX0^ z{9tZ$RzPLS&s?a+7BB?W>J$C@NGibXMfSZldE~1E$+y0G!0}@>ha#BunYgS`%H?P6 zpe2@smC$+igOKFp&5(od`7;HSS9~?bZ^K`tk0+A6pqH{2)#3R$IKSTc1j8?1*;a9? zhiODmXmPCXe;ht)eDGuPL%uMnj?tIRcY%$?M)!p35zzr^Z6K!%!@)^HiYIKzmsivA zn6dEOnP7)Arn6oIzWaf5-zRiS?a#zPziOULWC$uHjR5(ztc$l+gepW-M-snHb+VAA zm4hR+%6-eZXP3S{F_(FtdFmTE@k7^N1Pg4lV)Sx+J1tB$C|nnN(a}LW-*h*519^r) z=6^aEJ4dpmfN>#>W1-KZOL$D8tSa2V-gVaY8z2EFd}^|9oX-|R2jba7WQE%HhC zj>abSh`fM~qz{)tPb+hqi2PQ;zJARRA)4(vb!(A+oXgEr@YMOm_%(bN*iLjr5$^ns z&R@slMd`pV!(XJ|0s;nBZjXxoI=En-_d}mk`;Dx*V!b&}nsZYO_)~*~-+i4@+6bsv-xfP@v)6}n;RraaWs08hc@D10G(U2R{32lC zBpNOm*`&C?v6Y#w@^^3c8_{Rw9r6W_RgNAqk)Vt)Z2hTLo^e#KUNr8tpCay(~pD71YiFlrlSAB&)@gzWMgTr-ozXam*cOdE+ z?2~d4@yKmevZLHfoFf$>1BaGK(S~0uo2F&aH@P88S*r)(v0o}~XZw1mdev6X{GPwq zsv4>F_9@b6)sjPyjYuS(^8bp zPZb$gRoCA+!Sz~r^9l)__*qniPoJYC!FbipyhGRYhpp7mM#`;OU${A0!1rYqasg}O zN5XBcJotmy`fSy~S7^P>MDoFwTb;~>yeLX||0-?ef(#y``(T;U@K8MIKuO}IIjyWU zUJjTF_L<~@vI!97EKsGGdTl+j46{3%w`uYrm5pmIa(bY}w}DTAC`&xBXq6l7)&dt~ za4pkW4PJVuMlKTf%&21IF0lR4VIw(xP1kTEe{m%ukrl)Zm|+voKfhO&pzoUv?aX7} z0P)dp+>72ga2Q~}P7h&^X2gm?3BPllJcvOo_!WGb@IMU?OM6QN4%g@5Ds3Q*T%|j*lXwEx0To^?CXCuDBQ5~72!&n+R+zR zKh7EhYJZyMB4ajjF4GlooQ$<+8o~;DhA`u>7nHb$I!dYU4K)}nA6~oPBbhT``w9RX z2B;O80!i$~;k28qV>0%{b%E9s(JOps-G{XI>`jh0P%7*o9X=_FsQgEt^&|UMQr|aS z3`adn|AeBc<0CI5MGtJ(Kc$MhcOV!N_!N77?h_ME>l2y{^y7=_IF2{X+MKJN8nsui z_$Rv4rok9~-*Cc(y?eK%uY#UoiGVdCRod%+y^p%e&@(=fvhN&^B-$GjWu(Wi=y|xJz~G;B7?VS-Q9Eqf$A1_aIOV-I{SVG 
z3pRVdB6iy(<+-ETPul|lLE40?#|P&H(%KaaN*}EiE6?GKVdTn0j2~i8kQSc%Selu* znM#Bvg>~ZW8k$TG(^v_+gD-)SM3uznD-b^%Dp|&mLuXakBb5XHlS|Q(o(B-9O*^*==+V z27)m-epqIRJ@&Z%rz$9pcc%C#j6j*tUI?Sv_R%c_-y-Zb8`?a)G_Qg&Jv^T~Y_i1J z3DOl(zgNXE1`CW~?zSyRpzSyh0v`UNy`aTB~xq$d;#Pg?Jpe7~GOzQ9Libp}8!pCSq6(2LINt1ezu6t^2bK z2K?n3k_?Ltf^7^IH>?uK|NFp{I<&(Pd26>Qg%#m{5;^orXbi}`<_Rc=1dGnK znmdfR?YWVHDtm3$;p`Uz7YV}DO^Fsfc+ag~CTshbZ*YPHVC7<`M7Szc;g~9rDPbz> zFEj=ix4wTXaFov}P1GL++^@mz+!0I^0wnJeTAu!9OU%=~40ikv#9DWYh82Af*K*YF zI%ys3dUAYE9Jbs((=mEarTatm`q9$-*Cy2C3(n0@#|842{90FPRdUKlzwOS95IERE z5eDR8M<&{Szm&W5n~vZsg$S0x*oagguUDX-iC{Pgb-jwT-9(UvX)f(sPy3;I-R^L( zLNnC%E#{dxrV#mH*?}X8lSBH{6N&kjjCm0*FPsf){Jy$eesiI8L7XRlR~@LkfSg}yG{xHRAkaO9DD^o#i=kSOMS9!Med;I5yngqNV0xQ{3XFGdd5DFg%Kc0kc7&Ic5eAixs~ch-akWIHy}hMvms(S{XMAV{qfEN zDVb&Arty*a0B8`j^WPt=re;?8Z|E&)>o2r;q?Juo)Ot>CTb}Vc-QdB0IHJ(S&n~OO zjciMF;>Ri$_<}x+^%Z8(4#4!DyrSjAId@td&+p*=o8lI7;gf}jR8#`<)D28}hyzY^ znZ_Olaf&op9N4wC$BD)HzdNU_vm!$%oh9glC}nbr9lGqFGAQ<_?7fr|I@npPXX(%5 zrV1465b$5r7hSUg3OTUbCP6y}3(?6Vq6QazTb!Gp9AnFO_(=JSickiRr|9|sT~i(f z4JBJ`H2y8sz<$J{Rn1OZNKh_#TIzx_j8d7~H|h+Lvj&xAK0-y~s-|Nc*hYZ&h;`*J zP77g}bL<*E1ZL*=DHn4y3eAYpFaEb;cvcp=;Md{B8u&;&HX@9UqZ){La>oe+%0AXG zOT>J<#Kf<|zauvsl`4bjH0cLrtKJup#(G|QO-?QmtOsJr(*{&9W80Jx5@%wWvaZ4S zjZuATn}@;*?IjgZ%Az}JQ}rLcbw&R3>`DAm1d@0{`o3@R9?5u-_1>@@HC zmw4uoP(t8DktD@mu8r>=24ridig_!$`*@Q)q4$aD;%>Lyb!hd?ANEa(85~vV{PpRD z$>^2M$Bt-;JNd)x6F{=9$7w&JfU6`HUzIZi?G_=ek3V_l;gG!yyMYbkNlJEr_9wLM zcl1=gMAp$7JE5)72*9^(?+6B+%2(YQYlxZ~M7A297c6p+3^!%ryysvN_X;)kogM6hoY8zaO@#Ff;XLSC{$3vOoWG>Mdf-AvRoec(zfw)? 
zA%pJv#(WZ0SWJS^q2i(ZSaZ7FQJQ>R4Eq>Rb{0cB4S9x9*m{hjW4~ocMNiH3;S>?v z9g;l8Qzn~W4?xSO$S{I}23ueL1jUpgV^y(+C9T*PE-q8x`Sjn$M;4--PcaNnUwwfI zBsCo~h4`jC3ZD9MvlkqlKt+!0g8T;({{D~Irv%@^Ke3RR(6ti5-|67Pc^~z@kZ|Zd zvAYrMuDNR$SNi7@HuKk0yK*lGSclPQ5RBU*93AcJu%dc#ePy=m_=lLnk!N*L4j$A( zR1}-N&sln9E!Y|?^0|1po%o>!hL8px@+NTIqUU1nsvI=>@5sN8@b*=v5B6+=7*AXs) z>DeD9j<9z`?~UuK?c+84$lEmCP3-TSk}Gym#jDs5oZct}AOmSpX1gtUB*KnfFKm@q zQYn4`&u$XBY@8D^Tf=G1$G9D{Tfd{Q0(59F6Mb5Y@LIN1f2Mgda^GBM>MlGX+ED%O zds$XF_z2E7TG8J3?-8KZ{ki5n1*cE-dn2AsJ5C2n4*{w2pjoU8Xmt5*;o(x<;SKfl z`i9i^59bQ8x=BJCLx7UDPwulie<>hpNw_pmj{THkZEp7qH=x;miGDH12r?-cvUobT zlBNbxzb`(5(q52?!2%|9(HjGpn3CAO4`_}^8nTJR7&wA#_5*EfC%XC6{Da>vhV*=6 z_WklGM7xjaLgi-P3_p5(+Jj!nBT2d!><$EvEMA zLWGi>%BDOMJ7W5&J&SDKne2ImB*FtzcrC)v&*Z+e=eb!cF~0s?9LobmHu3*O{N=uu z$oM3pHSb*Oe;j|sDWG6!v!LK3G5*>h$veBJhLAvdA-JZI^8*tVY@2`BY$6kcGW^1} zt+Y2cecJ{+|Ma4ijU9FG0^^ksA}Fw+{X3in1-|H0;3@qRnKGO9 zI9BJ*n0n*$)cD3G@7!N(&+5sD>8_la!7x0QnTem|?&Xu45#+HpJgqoZvp^-(W?vN> znF^$NVd1@mZGrton(f|GC8L)<5&N=>LQ^r?;o)LhZR5GX5;TXIj7wd!n?Z)3l-6ry zl(vpmF5hP|413m=7%}RR%%<$#aEDP1&@hIK_%Br8UyjWuWKCj1>)jdaI;41iH#>oqpF zgq_BNySC@zQFTEI9*kkJ)){PSK>wU25)WSO!wSFmiMvD(JaMx@%$vdi1I9Fpp1BN0 zH^@)>H(*bxX&He%?*otGhraNHTCfy0;e%Bci20vYet%~9#lN0S9Ww{h+9X}P$yHc) znvkZd`dr@&BO=*koi<3(d0oi5kbr068JDwvd;*-0~R**i4(B<>=v6c(M!=z*x#kq13RH6+2=x%I1L(5SolQwf#2PJ<{LBh#DtSy z_k&||t9ZdR+M7K@FvIU@xRLgq#8Re5K4y5}<1;`g33^7H&aB6K7S+?KyV!*h@h1u+ zUaJ5Qw%>?J(n0`k*YF)xTnP$W0$)I9jhgiExBhxM^+yYd_4K6d2k1ArDg=Ncp*%cx>?s99$VpQ5q1 z$Mq-cQbSS~{`J9qq$xrbu^PpGQ$%zI00QLUq`!npk6}Mtrm;~Ik}h0>S(Kjjp??tN zwi-#;TqIi9AKcU)O8ony)b|rY?~Rtp;AShrF~C~r$1qjZCt-^!*MwmJWnk0fBHmOw zrqS#^cPV@W(1(@Tc;HijP);c9nOxK<>JG2pXMun_it315_4-7kF^ zcm%6dzHqT{KefGC)sGLW@Y+AyBjNPto3g$vR-kVp_1;67w~`21v+qY8Bnc#s!+Spn}GU4L|g^vyW0WYE=9e`Vs&BV`{LNBMgmM6#;_NgRD|%4*}+$Tye7Z51)|03 zL5PsNBh7LWKv-Y(v&$df_n~;$=#(HVfnwRpVgHk@rJVdrOa7CKX_(ztXrIW$YhAD; zF1_`cB8G31`{I#*)(WuH@v96v|C=#zqjJCq?*)R!pnvuxTpox@Ph2X{w zVaRPKZ}OgIK5G)C#!@n5u<*-is`&c$@t_}V%z!|}*d3g^4S|bea=7;Uy3-V}*$#w0 
ztn$Bvj{_(cM}DdFFyod63@~o}H#CZUqA-4d1WVVY6_N=BY11=Ew-`zqi@(GG&VHgg zb))4`@+2vyUmo;4^%0K8ify{4kUpF}$O0Jev%7cs*(ZCP0zeGy(o2V@G2g3x0319T zlF?=`_RDhy$FgcdIlg2C+RfrwXj)A*hQOBYl!_E1p;bEe*;*cBixWeD);;$u6^(Ub zRqYcNe4;gE!OpI%b92b}?3}EnT#(msl*#|?LyA-)vzeQq+0f$rH_jYAlL8v*8IK^> zudT5ulc^<_xHU6ZA<6O?s+})@Vt866_#9BZ>|&G72T9MFXM~>h_B`>3#^OOZ4U$0? zA60MCS@o-!4Sl3UMvF1r{%QHGnOt1-3g4Ne%G-w9iB6}f{h7mXl<}upqRcb--Vm(Y za%EYc7f}ZF9Y^=`r|HKU2!0VBn+-Mhu&+fu*1`)bWc4F$>BL%z=`Rgmu-{Q(5yk{q zbjZtGU1`>6VV$`#jwYfgf0p&~e`unB&M)RGMp`{Ww=`&{tgB-|&|DO{AxwO8rttpe z^GWiTAWxGsB=8xSYAoi}s@#nDPZ(np-h?8WY7_{>T|5+Ih`(KsvB)0!&4v=dX5+AG zs!cWtLCRO^I1I|_Q5$CLa#)NphTpr6wb3(k9#5_~yV5IsT5Y5Fx{nCS)2#h;N`Isu z5AeBV`upGifEEs{+ef;EM=_h=7l(hz0(KiiJ2D2pYYOh*lvhF$n5La%nfE*i{9uac z3f#KYTf~SEx}5wX!_NG|Hwd}ib|Zcj^I0ma{Ao*Tmqho*2LDVIvpv}ZQH2M7TnC{W6frmd4B-basc@blsR!Hm^v z*OAzT;su_6#;o7oYBgOkwa6(;d*>rBM|`Zus{A@YPKuhmI=jdgUrd^_%X?LB97ZlJ z@4F1Ck*{lwb8mpdBP!Xw!>Yj})c!dSj_vC{7#C6;RvUW{Pym)`^y|PhV zRD#FSQi^S#XJHS{l>Vwv5z+Kw)*+f)5QAg>M>}$9zi3=0KFlrhFDS$d0@Hxp1PP^@L{=&LinD=1{2!5Q4)9425Q5R(p4 zczRkqIvOpO73fvE#`{OWOQ!}$8jI0B?R%Yb;ba0%%PfASD7T51g>|SAX2eh8HbuR; z7rW7Czzz@UzQP?U|8>JQ<+wr2)o5R>iShxcbo{JC9FOh{A&LD;mmyKohhvG~@|BnC z`EK}TQd|cWEfIq4VH+>>FQYlXs`|h*z>ILko{tia?^7*&p_$|J5iE)Qxc^wT1!k%+ zGf1=fR!vOz9QmDBNZ6dea@ z=$h*gGdP>U&^|mj2{8}PRyD~dZyGBqhkl8M&@H)37eD=x0;->SUSc&9w5a5m;W5~Z z(%F2Vd={uoW?Nl@15+S_Nbf+bE#>)IqSxX~JcREjqQ9x_)$6Ty@c~P)N{l5wM=k!V z)|<_ulq&g4LBXfb<7ink%~U(?h?b%H?ZK}42sRlhV7m~AmFuFq0)EpUqxn1W|h$+4aeJ&@;zvDn$cO)U9W%k$w zyLTL9pAdnS2%=61#fwleMB^VE9V-| z%r@RcjQVwi=H<4~JbV28DbPY8$-4QT)8#iTVqf%?G@hfdQ~J$(BT|~Xe0KVH2FCQ zEnNn4L8#At=Q6VxxhxlkR)=92|J6`AR!gASCDF`fc(Q`_jeF-494r_EEJpRqD;CmD z5z*f|a=s>e*x=vp^+_|LD6_dC8_FE8ZSUdu5~#ol5bSlzYC^JiCE+26t&)cPr_ zZoi!ZR04Ea)Rj*2_PFx^C68kDL7Fiow20P)`)GG3eOula85ZOWN*Mh1*ug8)`raF% zi9fJ#TcXD!!Vb8JpHGVGXHio%y~^EB$N6{%i^cGqXeMz#`=-UpV!z=Rs1l+;xW`f< z`0cYsx&X0cJkq#IPjcn&t`_Edj!`z3XS(*mtomW}53BL?-F_u3x4huGd5%0A?Kgl} zfN3nt$6lGdb6f5e$St9*RM}J?vlW;*lM>J5kVCiwU+Qs|J=L2SrUX>dvfzm{s9hQ5 
zX^)Q83_V;nw3TR;OW&aN&X*5<3k%9O-s=C*HG!cSk!I9Xv*%_2wi#l03Q(?ub7)W1 zI4Ou=l`fma%r)S*c`t%-eA<$vDGl-SF`Co)#F8s3kNLXdisqTjozDYZnh#ElHAL2m zx^H9Lsnm#SAOyR@Ob^J%oZqV9*A8x2{XBAM{2J7?CCqxRZyr&5@H-x7>l1#wo24$9 zQH=IdhZXhXgbE4RqmhF*H!wkYbgZH4CYA$bKRyCDGHF!qlF$qtk1Q1 z4;{^XP*oZlCm-GPzYas$pZPF73^ym$^1Tv`%dm&2@jM!+CJt)C+l#m2ZF^5j-l8_V z==vjYg!)HSfef)=1S7BKzDcs;4C$*<&JFaDNyi)!503Ce^9eQ`4cGJ=1uC>tn~(Z=M%Y@UkKMZO{k9+nLNuRP3nA6<{uwB7ST`e{=> zA(8H}XL9|cY00TQ@uQugh9pX@cXxcq*_bvwq}(n;D`DTzr7FDVxJ(yvm7vH4HYrQq zm%T8o$lQH~iFZ_Z=I)iUTyIijza0WMG;oVoV@r&s!!CP;)5x_S&iR}Wk;bS_`+hf4 zSCjx?0YvS6$y zI&%EnLgfhGJ&+nm-LwWl86$s+UjmUhBh9F}ZC0 z^Bi%+YVUIv;AEle+)H*NJ$S(Yc{=0OX^%6>xFXA(Su1a-lW)9UjCTB2TFXBCpPRdg zaJ~HEgA0coc`oi`nqEGzt=dPDPV4Q29~SCn4+oxJk*xVV6dc1Gijq_re4t0^Vt%IRfhIvOEAx$CdIxg!lWn~RzoikAL=W8WVI}`sY!8)Y2A7+r^FjZ&oyfqNY#x(0FX7oC(hSi>iN40J4WqY} zlW{pR2^Y1V?Qv}bgW)nCrV#BOL_Eqi_RYgBAE0jgjei-UaL1&z%|5oF!spCe-g6B` zENMS_Y>3|Q=}>ET7JE~#I}K4i{Anp04vT*-EroKiu>Zh?<4U~xk^fX@*@<-}N9kIX zl{B<6={I$(@%D6feNE~icKupY7}Oc^UV09~#$Pjkg#Hhm?48!r zR^3=PQk53_+GL+UNIwjsJ5)=(_*e>`WUQ{Z0qd|hPPWR-c?Nu9PJdU7A+F5X+$nj! 
ztCEw=KUqE_lmBgLKV0bwLmEZc@woRhjkU-j?^lb{U0NUXhU1Hi8km+1+sk%1sB!b4 z*L{B@t78T?gE`2TTho&U<8rEo79V|vic;HCrKv8rfrB&s29(_JdG%xx06!ls=N>ke)dfxJZ!^H!+@G$8q#$fK~4Ho%vD z(@7-KT9==5GBVF<=AViqNgiKnsIV@r@ybu0yVO-zuxxJJ>Sks^V?Q)ULr*K~@qDwU zlNThG$eK_!ux*g`VMz_{oT_YDn&>i@&4)0#N9JBRQHFd9uZKNpQ-}T<7x?SWDL3&i zW#m+dNQUPH3XG6a^)y~Nlp!NTnysB0xZc#uI%|haL5(E@hBI?c|BgVpd$L%r7IF+Q zy#1I+L=HpHwuYxpEu9Ry^%A2U2V2B{ElLwB!~K}GVk7QyS}GTrhpf(A#I7P3n{R&I zh6Jzgi)}1acl*s|NfYnIMsj3iFfjt3Xh!8^DzVlvy#d-}&j}N#xHDd3t z3E|+LL6sQqRbHNJ{uF=G@ri0a*y%iw?)JQ0Qf+W3iKgStOQOcVO>_2NV98>enxXo9 zuXN!LL^50zCAfVN8!+iNqNz67%$ceupsE4ee14}a2yPmvGXf2oH?QaC*_ioVagKQh zdoy0}bpGMX=9#uexC{0SK5E3#%Ec1+&K2(qb2_McBys;AJr;3T);#&ljaCyc_u4(3}vDAO=E2wNZ4>OkISuzre|{{a-eo+T&N zF|=}P{~hrT>^8p{Z1#6{q<;QE z2v`4Zd(HRO9i2W`;s+Xrk*y}>l44jfA@ixC^sC&LH!iCqF336Y-p4uPO|RVXmhe?^ z#lcX};uugXTwx+8MsTX42f7G*NDw1~k{0Q;De&ri_$w@aKZ%tyXe2157j&ei5wq6) z)d=&gS$&Ez^37dsRMa1+CRzp1hpl&E2up-fW*tu`;G|z$HkqZ*u{ETB`?iY+fjq4? zl5}%LVd;oQ@x;3>>}V~(O+U6FUkQdOZm3xuzM>2>SQ=$i$qMNoW`ppc484YR1XbWL zI&6}w(gqeUWs=iO7qlkIXT5)jS>-x5jI?&qg}k0Aq+^wi51qy=5Uh~@HGgt6X_C1d zg4;9r0Yet%M>?pOon*f0wwELdUd5Lqd5$7-a!}guL>9HbMr@4@eY>-Gm8K1=JIj)~ zvs$uu;&3&f`l>E7sEr?e##G8^R%gKvbLq=)qrV1gZ5}#ZP1ihqG&>2=*&9WaVCfGK z5KI*jliMpAxS?(1?byHF<$VgccyPgbF?OpBFRVn;XSh4DVZPqS*4=;0M*oP)H*0Ma zgG(5%{iXVQ6_voHQtQs@N^n|HrBFe z^ucUQdYJ*wYyHaR$+x4acaKl8=Jnk6biXy9o!av{E8l!m!?OBx+j%Z6+stdc3#MDT zuvxON>zNZ-ZZA%JJD56p0q8Bcp$~Y6e$k;h{$A9Vfi`LohbwHF2~nuCN1XDH5z7n3 zVtZjhc7<2ZrQq&U&TnF^J@QW47SE5+%ZOw8mp+HdHg4Yss<2g?K>Y)1l32 z3!ptfJaUoS<4Ul&Fj)QR6CTTDRa`%+)k+FS*05p9%4xP=!3?){jD~xl&WR2VH4K#+ znKgD5f8R~qL|^TAgY8{m@0g#>pVC97PKVQHI2$;P;54E$vyIKbdc#fO>P(e|tB$<= zs8s^R`hAFp;Srg&+_W9!}$O_Ow-#To7>tr?&oiXnWc=e1TxTxK>-c6{lCp^(fAM zUGe3`BsPt)m~fi4fxAAwvM*gUNl|fyJz!BxaBxE#hQ5s$l6260q7=dN16w7iKd-xZ z(MN6InEliB!@@nm#SzSI%29ptEn?BNJYekWqIOOMBjpaKHYSm;9p3V+R|^Hlj^n-^ zVijE?QU*4-VnKYa8J^<9-}sfW#0Oc}xK6ZP&htvzK4>+14I9V$o;}6bG*bC=;p1`t z;th<|vExDJ!*avmjw2Qcp~DIO-5b8$Pz($X#dUVX>jF%)dAwCYLip~#;n%TIB<)~V 
z?3tavnvdYzi&!?YeJE5Ol%;SfGBh~Ejd4E}Q`|ztG!z%Rs?JmcI!=w-AC^#czw#&; zm|X~O#f{pdY<~hQhtUhvCxB_&&k2>Z`d$(Fomcm_Q&!i2qCM{sb?w*%;y&+~;;pcN zL1J%$+o;m(mT4{t5`%!p8u=?;#c!`06F%W~8*@th2>!(&r0fGnEv6A#kUsVViW;g= z5UPI>q2~<7RCfuv#sj`h!+YFYcM|s>!Q$&eNDpD&@JsCAU#$xfz27^=TrC84;e; z>94fBHWG^e!*D*<#|IxxyBq*VWR*JddED_|UIF!DEk`j~rxEc9k4~JI_K9cyLI7m_ zX9NY_q_ynzsI}p3md}|JALp03UY8kL$k~J|BiD5=a86n7t0#LBQ?xm5{@RB84Al7OS-oEwa0iJSlz%__D0UWZ6^5>g`XN8F zkt9hHlr2yh3;d@oh%^H}Mx|i%yKQdIAkB()uLEZfs=4yl&%@VK9m~`I<5t#CDo{U_ z)xV<_Ju!gVxtRU))z`Z>QRr%@Ur^{qcs%A13(~BkRjzK}fNLM*8{}+_Ukuai+GR0v zTmLPp;`6*JxGwco^e{Yqq5t;2z>yqQ^d(1VG@466dThK)D_ z;WG#pDrit7f54Zt_@U^Np2-fS>VC#Hp-js z!|k@xf32woC`&e097GZGEz69r$prFC=Q<3%E$jLI*7*&2!10OMk_rhb`nrcA2=g)v zOE!rds4V$;QVp}cT_@-;>c_ZU8Uup{4NGG0c?*8+|M`-VZu$qhji~My`9plzIZglL zTK869Q|_~R#FAt}H})1lFtJ}gY1`1@$@Xq&I;tN}Q(IySq7yTo%_#to%%4UD@!V1A zj5kbQyxfn!G@CpROu4<#{by`*S1J0WSsS5bA4|AKpwJ)vE#&sp9zD$rqRw*;56FI& zsSoews9kQ#-D7N5J8;7%{UZax>vN2p=-2p|Fw@OK$9!(w z4v_3R<`T>hsDh!l35#Puv#@^50rSakgl-PtBKBz|W1qY_7iX)oRcz8YQB}POfjH&Y zPn;6IC~CY9t3zXFHm+GPiGLf4yBmLv_tACi-4u$^WL=YDxjfRvIC9JUw@g?=2u1X! 
zj6YtNq4Ebw7Mw*?EAcm+5DMJ*>-_MBYyw6b?|aQ zb9Hl6X6T!xs@oljJN425Ut(TUm`wZRXcu?#8Qd!IKIDPII3+IR1Jqab=s)B?atYx9 zI))iaHMV|YMob6mvaeLW+z;gm10y}=X%=hmDZ^G*_ee|eiwJyiu>RU!TM@@2aC?KNX_^*scXlm+nDH zoD6btsq1iW$v)&aSPA75CkJtRQ{;1!xOC%KPo>{m4b@#*51M_r`|_a3s_~`hNhyKt8`ssl+s{^I*o9nsBLsPhC7TNZ8MR z0C$P}Ycsws3u25NbPXDXziA>slQKD!{8Q9pv%;?$h*+2n3U>GoO|3je(Dc@&7%de~n3<-DR=PGOWtF z&R+n|0r!D_1Gj1Y0zT=oua1ZzyxWV*+Q$ZOWh;!mr`@Q;S1h_%{ zbrqNbO19CDiNJG`8OV*wl!%%BBXAoy$K*@Kq0<(_VrSJ}^x81TKmP*m37VIUU1bER zv9MsVi~i%}|A)Y3dM{;aOJ~L=`TLm-iiPg>X9ya8lTbo^GBxQ@5=zo{@Iv!RlfWI| z-@uQ=aw~}T^7t6YPYtTH_NNcH4EzNAO6he{63fE7L4tB51YPcQ>rVl9fqwyafN_rf z7dflqa}e8;H8gMZ{HK5$z|X)>q+T(B!d4 zzKtS%(gi*L55PZwhXj7Kg#|WuHN>vD9U_i5_4cm={{a30Tqf{h$^kkXejlMqc3J&p zJ^w}GPG4pLrC+WvB{Xx1iOH6NcAo;S0=I#iz&U1rE7GXY1SWuSph9I>Q~Hk*w)IB} zaFxD~jrkR(} zWX+r>AdC^|KdbzI3;2<+)k8fSmn&kDUEkL$4I(Cho4~KYFNAHb(t*pkEuq4`L7!n= z=lm{wy$}2X+$8p*>rblE2DPMp*7;T+^!#^#pNO11${&fjuB*(wQy!JaOZ6cpfa}0d zz(e4YBp(6Pxv*q_|Jd+LhyVM?KmP)*0V6D57iCGXmGokkg_qj~UGLLbMMc{A_=Cg?t_bGu@q3iv*7i?-LEw}JD7x94UC zzHrAQ1_*k7AGjd+UXZ231!mZ8(i?FRY5Vri0{0lZRQRONsXa%FK46-6xX+qS-Ohg- z_?5nYm0n9hwzk^>nv^Md_Z1kR`*w%i3sTOeW?k>#7;pht<95=v0xJ;o`!;Zi+?P@N zgj}WrX^Pk;=yu%{z!l&&a19tI#}#=H>*BE9vpN?J?l|r{OBmoGS=17U4GjtQd6{!R z9sVgWd3clad42SHOpTf=Wo`d!T9bFd`%S{$mdP_k*+XBVGItq;9ah%bpxHMFT0X?E zIidFOs76l#*U0=XBiT9sP2dN@1{XPLo607b5<#Zj&V?^p6Lt#Bh2LZ1bd-da?kuNS zk8#QDJtw~xfqVS@l3W`^le+RwNYnY*3_wWw|3L32kQSy?b_n;8)7E6WZijzv5zKpl zo;vu3W=1AR!q2=y3{igX0Mm5+Jn6!*Jir)miRrYjW&lgT6mT85MgA$s;ceQK|f9tS7aTqe8>Tm&8h_knZLJOo+4?@1hzaARtl zfzHtPyD7e(&msLx{Ag8&p|)TAP1VtB(zk8|HHNroVkJ&Qg%pOm9Z zUyc5wr2h@dPsp&_;?0y+wZ!-TVjIsZz!l(E-~n)&_veWYHb2-%*aFt618^Gnk&*vE zeo;@D-#ZiVXePfgfMMVV;8)-WBIj!NW+F?MOdmCA1NM!7egy8&fL8dPwy?s~aLRzt zRfV&RiyrcXM=#T^DUw|Dy1cr!tTb6&U1KPH4#PmsrS!Q>O3@`wUv*w?0 zMNX5eH(~}{DZ`;5rsPM=10UnEYnu#e3an8k%Nwh}C`XaUfza9YvrNuaIHeKr2u%ST zaRW5tJiqhx+tab9X!y8G&7GO7C$f#xP|rY?L7507){*{BTJGI3Ufe`cWmw z_gNP!6o7Hy5^#YeEQq>p$PO5fPRs~^dw@d`42MXL*;0~lndSDSw-Y@V24 
z?A4N`y{-JZ=@1y;`Nx1#089WES)VCL*G3=GVb{N~M643Q;{yGrLjPz~vP#1Ls*~;)qfQZy*!28Ta?kG0ocM zhiR4wN>s5e0sQe~@3(6*o@AY>ef}vHj{C)FNrY|93>5PG6Z+JDZnWQZPgnVC)&`%- zMHVPdZTdTTI!)_mQH#&au>}O6{7x$9d>{FLtOx8QrUFv9ABm3$-a-ipbU9&3svV4OAj07xp_)oQ-8n``s=PeuH&f~+$b z21;eb!`!M6isRFu^{_k0XiclYFs;RAz{dMN0WY=!2IvhJ_;wzb>J1xcc$t^(Nz&Sr z5hPD?K|cA>l>|y9)P`vd`gmGT3Op8#`8>;@#Dnn|T$uqYl@JZfdJkWejxX+q`7y4l)32c61AL#ZyniZUXv(BOgF8WA zE$UnWzlP2$XnGc)ROscgGZzl!DrA0FVHC98QerL_u|Ft_-pgcr^V)rh-OR0?vL@?* z@-RUal*q4zq#eAlt@F zzGjR@Lx%N^$20Y_Sd}U)F=cECGsqR1Rk5Cg0=r}?Kq2(_ZT&(+yX#Pd zKFxv(O;kVRW-xFXUZXWDpB6+jd^5Y6{59djh`vhA*_Vo9-l8$~<0;$2P^5 z$@3kT#(c$?G>>R*8scb~pQNd!&KufGg3o9o(c~%-ezRvguaV{RXGOkbB$xtSo?tsc zF+ikT+ghR(*jN;)Pam$N^iT6D(eA>RpE75_g44DX`*|ysiLUqg7LaTA*%4L({LsXu zAlciLf6KVsluSh6x2-PF_9jWB3>@WkTP8)!9a|rj$foa1U`sMGmWAN%o-8HY1d|VEnbvIgS1f*M#ygB9cxL{>Yy9j^>N*ug57I)@A$ospp|1(63?9QjK3fA{vvQxZXak!1id~1)QNR% z%-+$nEH`f!-!wre8O4?n&vtP|a=mtQzC32RoMzYg>w@`H;t5lszwrWO_go!_BR14F z^SWBo$o3T({I!ha-QnKhUH1aJz#8|2sIfL1;7bnElhq>o?gE;emEGmoONFFMMQ+T4w&eob zK(om`Cp%}d-|B%s$#&gq%CF`8>Y)#3*8np4s>5?`k-GT}r& zFAwhhzXF>I8omRJi;3v!zY4^GaP-=6UZVBny7LbSmqsFVo)9nQa)vhl>~Wj#CHBy{ zn=4B8S}VILJ4ttdR+HPjw}COCyaqI;Is)?h7huCcx9e2E8k*WsA%dM!`|WTqO9BBs zudxzk**fcl)=yKeyy1Ep-DS8kN-S=@Cyt#XN1bid>yCd?&Pna%^OulBq$emgT`ZU5 z|GG@iH-&8@nP_rT%^Sd1JO4JnY>fQx3+SCMuamNebJh4iW>57RaGK*c_OXmkCdSO? 
zTf5V>CmOy2`_h7#%9EA|wXkr@aMqiY-(6rtLaUUwoiO2)$SZ6RWWKF`wtyX=$tEA4 zuA3)-Rh|rSY>3Kh%a7y7zI+^O=QsIna&Pu0U=wKiO|D7!EQr@u!Z!0`2E8eu?d?H~ zE#g>z%`Z6cFrT8uBRKlU5z*Y1 z$b(Mlh51xfC{T8rTs+^t|6MK#+Lkyf5#KH0D>MbsWwi_R{3EbNb5)RZm^DS2m%GQk zPOCs8=J$Oe;<~jyR>Yo~J!k?ez?_0^I|aBcahwJ#=3H9fvq^q!pMQ^QGFHj_8q3>t zwCH$l11rEHuX_g`dvBa|mo zQQ+%*{|^7Gkp7K)&dzLrc^?hR>m0BKG$ZclSfmfr62H}xq^2d3w+XzmYxG|xkc8!# ztfZ4ju$~gWU=eNM$ zz&m4rNJ_zF8Zl#9t0seZQt!ExWXWlZNQ#0bsij*eVJ$Q+x3~m6C2Vffy=9eEacdPS zOH+oabl3j+nxOG(cK)J(MhB8A2*nAE=)2AH{Oyf2F$?oU(#uegY>U;T9k+c1o&bLl zVG0S)KOzVerp&H_(> zXTT?;zsKH~VM*G8Z{g9KcG`CVc*2c3?JQ0~S_8_Gg(^zm=iJJ#zzg6n;El?!>n$5+ zFHb=ne@g{KgnfDkJR)jm3WlbAY=i92%dd?^#Dcs20(fq)xtZ{Pzc>n+1b!a9#}(ie z@Hg<wbwv7)^87{VM>Yq@^v7G^De&BZ zqy?bY9*XwWzByySU80sfQtpLxc@}5}N+P$ZCEW{Kr)2r{iu5B}^}*`ZZusvN@H_BG_W`DK zysesPgLtMxTg3fx^7|+7URN=+q-(uTBsUOpH0bNT0-gY`&4Fo>czx_8^Mt9*gWcmk z!bia03Ot!HVPYLo=0=$Oq|6-8i}v18{_&{B`vj;f-N zZ+)7=|Bn?kUK@MVdV!o5^A>l3dG6DDr)yoEiKyhaz$XluYxG3^|E&Gjj$~P~t&M%Y ze7Oo8krA00skJDPrK_uZpL2d*5#R;E7vMcXfEOfh2>!qi@HzNEXLomLky>k=89H1$ zFJ=sju{>nvrlZiP%Fg9Jmra{yHcgDNs^~M>-?u~|I{pe4GDjAmZKTD&!q0Gb2vm06 zY<;y@$3u?u4Lp?fya6x0T+&fOx@-?)gFG#9nf@i-?8g@6ZFc39496fcnQJ1gPFwf$ zR?_(OVgFOAI{=`#AO<{lhsROuaKu|?AT0pASQM00q z?=xASVe1jk`^2ReV9*f?{l7Lu+uXG~VrHF`kxp0_n&a;ZU!M-?f6w~;qks@sk!5%W zf5HERe{Yrly5t|{KWuAHrt>M>g5Pz0em%7QrX?Ty7I^lBz402Jz%}?q*Z<8-Z&RC5 zmD7))X-%%@sYvgCg4;v#4^{Vsk!L2LBXk{G$K$bq3e$(}Nd;8<9UL65xFDOU(Lebkbc$T&~ zj|JS3y1y2p#<;5GO_5e#gFoTUws5#FwDsw%9n(zzKzaL=cfF=xAIkHyENqx$t4?av zDT@DZM7sP3JRE`)?`T7!z;;aNMg7bCxFTg>Y3+c%K z+=8ocb6e8*MYygM<4ZUT2h*IDdbXa6H~ZFBIO~ognpDkyCp@>zun(TXHTZQ~C|2OP zMEzdCiSxoUUlEx89g${lqd)K9ny#6r#iuZ7w6o7ddY;z5zIFXQRfjLYNe#B95^>6# zr{BN7Eu8N&Mam2C3=Uf~-&OdYt}oZNmH#`quJ=EuLG4jVaaGpyFSxyx5}*r@b>&(# zG~9HRPK$aZ@43DF)B0QR7S3z1bVSzSjW+)V+}?Km3vf+8bwQAlW1=K@BRu>Y(*LeM z+!QcZN>ZEL;uGVL#!%*zAE_-ufGqEmE(9VK9MOS6pHlApKt@7Ze9O-xDLQF zJcpwy2rs8%yHxO|?C(bOP-rlyz+q$4m{KxtT~(_4dlE{_86nAx2K&}j_(X6Hn*kgFM`})V> 
z2%Hw7$f#Z4o@xEBc3OX?{XYV8nvO}gktGp&JlcgcJu}Akg90BeDSQn1?Y902$^W2I zx^;QAXK-JHCY@~`4*fYLWQdZ^^ILeO=C>W?UlJivoS#v3-YhBtPAb^n=_@8gsB%QJ z@l~~*r1d}Aosef08zf_zB$GlTc+=@;j;+qGx;>5BDTg85K;zyk^B4N`5?3 z?fAtm`hQsZKWn+HrfvX*-hpjDGt&Q~0(&W0;rFrZ&yMmR6|Bd+z$zM6_eQr5UpA}iq5xj+>%k~3bwRQNeEQ4%HruBy*-c^A?7Hcr`K zDY}lQ*3s^s&zB{?aixR0BD7iD52f6gQF1mE0l!h7>E55Zgx@gmQg2+=V_8kR zt9w3->Hmm}^TaBul#f{K}x{52 zH581i7|}9H%)NgY6|60tMsLR8yX&`uSH7y(TMy@*VsmRioZBGmvEBn#3l8`e-wEi{yv#<4+l-;lCfVVa4v3Gbh^!{!j)`Xt2 z_wSEdf_Pw)K>Ix=HEWVD7e_lkc>glhnb%^nYxhrP&^J+ITGa@|2dqDCC>^Qre%GH{ zyGf=p$$r-#`V;w`RsFNy^*1EHnwm6{{P(N>6%`mF#jyTC?_V=Co}S)(ZU1Jvt}Tpi zg*BOweSI=Jf|;G4ecjMX5?PN{_Si1= zeFs%!s;a&HnG)=~9fM5I&kO%ASBu6AcoG=>JAEdEJa|Ysx|ZlAUI`<9w_`9SDBW2d zfM}3?tpw?AS&B?)=6Q_`l;l9)==t5&KcxiwtRN&&**E2AU+=C)b)g9#(iJ;bkvwW7 zyP1(p(AkHTBQL9A{;lR)7It}lTF}39LU4=9ZS4PU`qL1S<{@JZjcMj*d)A0b(w`$z z!Q4ip(>&X5>z~q(&nd^=)+qZ+$$z)?kHSGXDAKKz;%SPV;khO$cl_jAj^IFzPH7tU z)$Uo=QOR!}=Af?Gnztfoc)Pp)j7WbD!>j_x6*1(ziUHqU^k+us<%a~I+?4z`0Sb2d zOzOvH3_h_fo+UNV+<-enJ)Y@Pf}?O*r15jc`k1`W zBe(;%;qgvBXdI5ic{nTh@-d|;-@zlbsNCP_4A$WooP+Z!bx#|`&Qrbr_FWDc9EQ_y zS`Vip)4B%RpQ=&k$xdf*2F}A5a6$C4nYAvYE|t_}^?uvuYj8~RKWjyhX$AdlBKO-RhGjP%vTBFP47F>h7JDtHg9ECG*UexK+ascnu zC3Z_)V#E5M!4V^EpH~bJSOO2V{zp66plLVHkSM38zKdyecd$>9myg`041C(`$MI4|C?gLZ>%t4BL^$vxO*{iom}Tr~0{1WdqyoQoPX zSkbAj!>z6FP=k~31$+yi>aUh{c}(5%PgVWC8e?iM{V(9}@Rev|QRjUs@|~4c$l$@& ztlK1r^JM zFx;s+51+x`;Y-PRR4JHLq&lzm)5_44PiX&n_zJGTX^q@wHt2KG`X+8{J;QYo|9vF~ zShYc;bO}8vua<2k?VcBH@7JO&jvn3AXP&%zQp!79V~A zG^2e|dUb1C{lB2~UlMn2j2f+|cYO+SxVcsS4LB>}-LpnM6Vq1fBGjCKKhzdFy#8&Zs%?9Ilm34v>bThr()*lo%_1L?R;GK+HNOXBQs893Pl@?-pj&L3 z7@1XAm)eh>`>$Vs@8JhHCn}zf%3~=dJZDET^<{4gU=l8fHuRkGAEQQjU(*n9B!+MJ z87{+d_!j;HK9QBI8t~R=vF$n)CN_0&4nBc@h&66X3Glv}Zu)RkSF!r3D6LAA&! 
z&=F(z=3R0WPK!A7qShZXjwP!K)MSrZeizT+ID942_O$+zjXd|&wK;0Q`7d1lm*H>l zHxYl<0?$)e4C7*SNk4wq^``-+;7j-pPU`y8(3DaxPvp2exBUzy`F{id9X?Z(;5pzl z063!nCQXHH%KxDB=Q}uNfB{uCE%giqqN~30g7r9Q2H=oXrE4R<(^87`Qa6U=e+<5q z3Lm$5>lpd(O0FH%6P8{6pTd8@*Kk}dc1izh>RXxB@KDYF-Xyiz^oWro-PAMlyj;Of?7F`^;zadqyu{QM^U|C@+EXNts$Y0oFscav(qxBdK% zn)kdc6ObEP#gs}y5tUr`TyC;I$$ow&)VhlFB)SFWbgk>FcXs1*SgesLdY-ZNhBQ4p zqt?Zg)NR2XKfhsr{x0HARS)tF?WrgrEX$!RdRcZBF2XnP6`Yp!&kbiT{k@vrWKC#w ztLaaV!>90>eD6T$x5$Q6go2O;_TP!Kbwheb;0k=B%g?m+SoYMYP|+Q^uhFHISpS@8 zL$APbn^rFwe0)uJyq2mO8)dbcl2^(5Oxk;PwDL^yt7w?zU2I?(j%xiEMSKR%!I$tEoG|9#FyKg#Wt-<)PWqFq-kS*2&hO`N1GjY);CS9C+F8Pb~%{|EL|Q)Q`UvMgued-ztVyQ;7{q6?sF zcd<-Wf!8SrO5y)ka1jm|NldiR2I5IsD|#=)-99ia_k9KCjrFc=`zRy3~gDJ37qjK(P3 zNN$&~T;bQ7l*lW~0Ddq1sT83?ef?GO;=YT5*c@DezbSK3HGslF5$r0Wl0_9H`@kq% zfbZlzN4EgQbj$(OmDE{)cWM2Z?%!WX{%m15>FXX;GsNSKg6cec3;zv1g-JuF>udjV zo#C?0uD^+iw+XlaUnq7QQGlFj^UDUCS(OiZ4XKdWD`G&`Ppi zpTiHj`_wH)?-)+)KxVWoWKPh)IV|F^TQyiWoMU~4sfBmFCm*Ma54V$Lip&6z(F`+RkJvEJ&RxybVnJcG)j8_vRv-}qBvu_hBy;#cu`5Hsm*0h$yCNw z`JPkfNu?dzmnA#v6*C5C z2Jr}-lCo^*f2icR;Y{}Q`yE-bnuPVK>NHsiX5ozLJ`D@nRt={Ty+vv>IkU-~<%QRK;3|2&+LNr{Vd$=rA-QboU)pa3@7=2Gv*thFvsvRFQ6T{ZSwC*^jsZMJ}|W^L82zLOIFvgpCGA^wPKU zi`Jhu`-F_$#`&95(Was26|3i)ndFYXwIW1>RORfBs_$yj*zsaGDFU*tPIXqd@P{v99{SpjDVOk~Mn%p17 zeJdN9R0F0|&)6VsPknc_Ej%lk#4*sf)J1Zh-gl`ClffV@>tC|14^7B*1yA|l13}t8 zCWW3+iDF=T7_tZ*$Td+u1N8>21aB~>gv$YEpYwQtRj;%g_6dXNn$5 z#^o_3Ab=;(PwPLd0~GI!y-Ue%b76_bp;lpi(vqVXF-|OGL{WQ5E!Hy<)^Y~Za1@T} z^4z!BZJ_a$iWIq`R=b0`RQE&v(*o9r$)vVnZpK@TDuzk&Uz=6aW!;J{9m9@`wRBYy zIH*j?aJg($Yg-VJWvO?jB1R`jVN1F&HjP~n{f%6*iW^l{3YF zWPKz0C}ma|$c^P`sD&!CtEm6wLY~dTpAcb9XRW3SLDQ^y3A>#}>b<6THOzQNGOKBK zH@wwQZxLp?-E>7X=uu4``d>A&0eU3u4}#r5SeznqD6%k_U!F8JG6$hWnf0LZ&Ivk7s`-%ioI+L zt7t`ii{46>8r@ibSutFUp7*7vRYTiF8QGGdx@SY{cLUxlR&Ux?$@1Gs$?t2uV(V(p zcv#9s81&79Sh8R4%Xnv-rZV6BEsTgFDudlH3msFm+*pp;Qd8@yhinZat!}DaE(G1Q zs$piiwa6%SFk96(Ll+*{tzsmD!?10rR3we#hzzW1T+tqed1I2`xzfY-&;{cO-x}Pq|%aBD>LpLElp3rJ~0gxd=eznlE&bDv-(S_~t=NI-7HcrEc@Z#e 
zGGYw0?E`aBaa2MOZG+|Lh6v(zfSG}C38kXz?wdX8_4IwKux_q9V*JXcG1)u_{m}o- zoiO$6SX2@1EIsp$J)^_fRJ917BW>6#TK~+1vhh^ESTqWgp4&RZ4V6r7+d&r=!9Lb6 zl3O*lZ(-;_u~9R=qHo(1VNv+qxW27Pz#-8-^t(pj-O>3e*GQK?y=;r!E!!T=I zBHvtIJQ{6~8*NzYhQwG_rPOQrnZ5eJ@cUL0Y5S59ruyA(=(ZO(x{jb-T4+#lQ8+8! z{nS*&VJI0IC8~8_fAo~~?a)-Dx6w|4fhR5{1De*qCIuSE>-ygIO-Y-uD1zUOw$`_5 zstyPptdbIbBbUKiP#atap=rN`6*Z7$mbZ7S96A`CV6C=ZW<_Oy%9_WwncWqn25MvN?b-Ls5-i5y5l2r6D2Tzi`QA3qN}@fNPE% z+7TB+dY`&nZ={{v8AUwMRvT>_?ME7Hid0dgkV2Q^&s*wZTYy&67|_S8{XVlNGYiqW zlJ}h7pY>SdG^K{@M}}X-lmx-UA)C?vg~$o}J$w1_ryPH2R=uC0+A72F zpl?BR%S>{w$f32-)BH|7y;W^kWMm?M4(qTYld++)9pM~xwQZ&17HlS4WE9(ag-+_J zeJ^`hm%GA@N1TfgVKbnWx_TLuZHk}DU_id%Q;|GY1&Z=QB;gx9Rc~Nfb0i>P z=`GcZ2oby5*RUCjp+B!hLOxUMfkyTqr8zoAJox}#!Fp3z{Lj_G<#w}TH-OCcKk3VA z`uYWU3ePk^U)7UcbHHU~o|;0&coBnCYvzNdjOnwdBUG__Oxx%e-a@k_n1ffEbEEkF zedWcVBN=H*erY^uUMyyoDn+y|L)<5ju>yY2FqS;ol<)Q>#@=dSZ6RyV-xDQVPQD;>+AF%-y zGXS|6u)Zj*9>ddk{yjW_M=H#R2QJ%1tf#8LB0PqB@H}2%L4;VR)d`p>l6 z`qQ@V+>7u)a4MVfe+G}#3D~fU>%eN_>k0*54y}Jxe{{MyfEtPYp8mZj=H1(HC)WQK z?g@OQETX8~62*R`lec~B253PzFUJIe=~e%VW#aor{_`RNh&7QZowqhhEH4x(xRz1Z z-4FeL1NY#xkTP;Bdd0tIG6dP?z zye?5)gWnv^WuRcIZa~6~VqJWo)B7B*!oT46fW(Tz*@Sv2;@TC!4J{2H-GkrYXCK8+ zcdLdmUYD#a76I+5@T`7^f5BCUqlrm@TC^uwSuoN8YvRrR3I2fRY5lR(QT-EHgt+*m zoBu13&j0SLl^vDL##Ben?atp;YNZED@DTojU*V=tg7#F38a30On-AzIDfR}g!hgaq zK^)i8=1pjTdT8Y+4t@uB;1~Ga;h56-9x;-dirKBq5=rY~p8W}~h5?Dpa7+!>euMkrA(DTsi)X_-D#GtHm3Dlk{lDr^ z0$nQw)~&Ctqwq1x=da)f`~rW3{6qhvmnE~hg$p9&pVq&rKge~DTK`d8f8!LTyVGAH z9(?cmQ?tc4^>5`BeANrnFi+S{hdZ$f#>cJUJ5Q|$i$euf_%jw#k3yrfci>m}BjCu=QX6VU9XAyDww=E|tcW!JSGecv&&4w} zX*b}$T_1b0AJ0U5dMze#2U3BCQ1q)sD-dB+$p0t!E#QD!_Hf+YAbs_Imo0@+ zvj3|o?{g1+73uuDq4h_YP`1U#I9*?Wnt#GiBK_W6zhCllGre(hU(xjOb@(Uz%jF;I zFWJvV_|=#E-oRb>1%8AZVZGC*X`@O$M%lBr@CA4z@B8~!((<-M=#z|n1EXIR?ej12 ziyOf7eDt?DiqSx~nPmMRi?saK5Fl*J3U@U#x_z_$>F#?KeuUrPiLbw91&|0oqu3Xj zvu7gRPU~-m{3BagDc&HWL1T{kM4U*W>&)P5u>0E(~}_(c@kC8U8usJwyN5GQU=~>)SHCfWP2h@awknFDY#_ zRQDSw_J094;3wIi!unB5WW9L!I*NXmb^d>W>qB6M>&32-xchbN?zt-Y|0w&@_4T){ 
zH>+$l6m$d^rGNi|pWOisbbce;D;bb{c=!d=pTCCe=epfgE9Q5*l=(lyYhjrt-*pup z+XZ{$lZqhg*1j9vB5Pv2PS)!exHE)m-&P|~Q^IIVB_r){Q`Ym^Ryg3EibCt!w4N2K z-iWdM5BL%OfCpPq$yY?#(6`EaOG|q%-~XQqUxwGeW`+G2plm6@^GMz&Y0vQbyDEyW zTf7hrQYlILK-TMzZ4rT2?f3g8yi_8826q&DC-+}ie@nnR>sDaz>DKlZo+$ACH(cEc zCE63;MpGRk>mt!v5*G4p_!a&Izrn+;Xq+p0K3Y_}`il3$GrtbMz)x^{2$8(2hda75 zL|;cs?|;uu@2jC7?^<1NU5eP03cnVX^Hs&Z58eC3`q#9CjuL*WD%LHEbp7Ab|K)A< zzis;0(*C_uj^G#gH~i@^U?KlCYh-KN#iJ$Qg7p4BZuOpVepl4W5Le!47)$#1r>y6# zq4h^|a7Phnc>kty6xZM<_!s=Olm4vf2#4KlN_ekDdjI2)_Z-syjyb)K(5@Dgqx%#7 zr2t@QX#HzuJlayBHPhc@KjQAWCe!QVE?X!mb+g9p7<(X6VYmfC`6eRwnrhDGQ zL%0by;n7Zb+7oab&cZ420gNc=o2sNqelLckAD@b5+Rwu&Q6lt}`FH|%;odImF9~Mh zjAnbsC4x7qvpm>E{-badPH3|Hkbn~xRHR6yf|ol9uL4K({_{cx7^rmgO#A<0cln=! z3vgD*!#%ZKy@UsF7as4#=`jW;^!o=55qU*<^LucAtMzyNISMDWe+M;zvSja{_OG|o zSEMofGezX;uCSEvz-@TClm3jtF@f37+gw+QeIHByFLwIzhA=r!XtsU|GL!#(xU-A? z0&|l8Y0U=bR{Fau`R}OzCk%FYLiMCYcq-vP*$U(wpPG@T9}#wCP07dP!*0Xloj9FJ zFmI@M)5eIltmo7EceFq0{GJt{XhlVn7iv6s1g~~Fzcpd9oYJ8EsJc|%!VBr|j`~v) zrp{?0yNpVI7L8Bh;ZB@TV=xb=WIrZs%IdvhxI5dDvBdh1!bKsyOhJVb|3?Lp@6`IW z$0tpe&j?wes{F+p0WkGgpd}-Egu=Lu(0RzX;OSvvz6fh zUORx*PM@-1C1y1&-WH?Nf?B`!_)Kd3Q!-%dVibD2JDpmre+CZNY<5?j4{u;`clnnz zSUW3VsjlEN-$(1`p7KAS_16`gE=hlu_k4atq*_yQAQ1<@4FI~`<qsrWEFT2cX+!{gV>@n7}z!CA{~0%D)7YTKu#?wWDq3 z-Ja(!dL@DG0KDi}xd(pDL|4+qc3wxGi{ea((YF<~@!NLBY zuUP|PWP(3<{W0GBG4ijO{M#QY|BA8cv_68gyP_rluY-NSC%0ax-bbuIW<3WV{rh$O zep^Y_5BOwO4UeSEhsm#I<=>CdpICpCqCenMmj0I|zYkk~%}R-FOAUX3{HoRk_`&<1 zs~Cq3%YDG7Zu8t9v;K-T!F;6t)UBS-`5^h1gmhB2jOd5`zKmaKr@v6M*0T0TC}`aX zOFqWH4LOJp6A)4`SC9`|fBgQy3O*n9`@X>+d_ad3^n(?wTy!uWtRO=UrlUgs2gt8( z4YD6&;A{UC2bJ3a{TTgeNPZtF|6DDrD7yGa2cLbM{)B=4NC#b3uN->+4>s^VAMga6rxW26X z#``_57{HVmnT=WSgvZgV_`p6>`B32r4gs-5q!U=8fHT7n=#6| zmTrk}_LHI+u@q5S|GJer7x!HMj8asSf(Km@JHgxiq?jg({cfmE_)Y_*d&_@DV4e*f zfK^r7-|nu7Wu=&=0**^94lnm)>`tF?F{UT|Yg=+@Wk32iD)8C^#rJ3>YJz1fc$ixktG#y%oHw=ex4wJggRR5aK9PCPxk=}$!uPpMaLpcLKP0Qlci{!@mB ziOz4z5AQ3*H>(tFMK|nK%Tnzo|6IU(Q{vt0$^N|e;jrE1cR(qYhN{I28jjjmihEiG 
z%JB{2#0}$@&hKuXG`IZdsA_yG$`5`(if=|=F|g6_eWi#~if>kUy1ANk-st=fhJJAK zGXY286dV&Ge?#rPQRKL@6Jv3D`y+53K7}(Pe~HPIr|=ja!i!yWU`D*lXEbGry&BOhrA%e$=qJe(68dgO?s!nH?{`IrcA4=G2P=>R-~hwuQN?qbFcN(5(ACY(@ zsi&Z(_V84sd;-sQGUHRCYd<0R*9;isDcshG<@WNg!C^Q7#|7V?kSAia_$fSuXS>M% zknP{3f$KEYXOW83+w1=XoYLW&6Lnix+5Kml$J|l>bII?t>g=h=+SY{8?cJL!P?7qy z0NfAA1ishr--E|Hnc>u=a8cY1<2t`f+W&_t@^3HyIvj;FQr;*sFYEp9YyI8b*MCm* z4@rNL{yu`o8d2LkzqxDAMfd_PNqc7$0k3I5?*>e6ZL$yTIRT%+H{xBMlJ>n3wnOT3 z{SzK+y9e`d0j`KUa!N(MHAQeo1ap0Fs}4F4eAp(6P?xDM0s z*Dmt^626Ah+Pp}!uB%h`_nk=VOK=1(N&cr5L6lJ@OY~Y&EPnErs(OY>Xfe$$lbu>42h6nqX} zO8-U-`!DS86x86+*7b+{zk*9TYU5B9N9|L10N3Cu+zI{3L;C08d-z7NU)(_RAV#%nVsS?P{UbeGJaQ zHzF-Rpox*H(8(snGjm8*;i><49nQfI@IT-aJ)cZRMSTZH+*=MO zWmVI)>gJFSI|5hWTlfsl!XaaiN-;pgQtExU?az;D{eOeY*5H|5zOK~TxLk4{9uJ+~ zQ}8u>3l|kTkLw0C3CExZ@8O69124A6{OpENdbL_9AOQ1L;%SpOCH0WQNKr2rdBp&WpN zV#P~Bx#jOa2It{#@DI_()@(pDZA5fE54P%0MXaNLgNt&#Da=kH(*9etK}#&+iDFgYV!!;F7vqQ+A@Gq*U6!ahae;{^Oav{||7< z46;tD?(&JGTScDr%2g~vv-@|2?H^{t9<9iJ_q?n+slFW+B|1x|F zUkI^gATZIg&;mwO*&Jwkf=w>q5AJj)u~O-3S<{J()qN?zr% zI~`e~x~7lXLZ@8t^?xqfUhi~8; zIBsYwWu3y@Xlwc^hTKc_*I6r&D}NC5>VIKM0rM8~75ADRsRr{i#R< z5l}9`lI#D39O!p&){>98QZQAqyv2oLAxP_!{#=p%A2jU$zQJIRDF7-fC$VXN=HMKB z0iVKgK_*trfaW4gEUW2o&BxVJ`2Hm=CN9a$03vhMw2?Dt4XuA1&cIjj4V;sa3kZm+eZ_=bOK(B1 zoUGSp@Ev>t^G2Q2H66lcp_BD}7hpvx{QE(q;p6s;W&LlQzFY=0l{Gp^fT!UH_z$?O zGYqb_WK;aP+{+tyoe}{D;0pXV_)^vOYogy$5~^ZEg5A-O-ad;W>sWsZzrR)TaLORQFwD-X5i%~T zrnzwmJ}%z-i+a9X1S`yo;GZ4YkM*!WKfo23*O?ty0Fs*lOC_Z=y814yu>oh4c|IWn z?AE(4AcU%dxwgv68~Sq+zJtHRC1pNqCWR!3u6l8*TEfQI(zKo5Lk1XHD%L7W=0+B@ z{tfwGfbZpfr|ru55s1^9vXQZ^G_R%=;DlP_x-BC$>>+9_%8 zz-HdNhU7A-LhF$r9iGy3Pm!6-2lW;xT|@OwxVCN<4CKSWx6SXjA8`|1Y&Rgn&ksnB=KNvjX*kEVSgTo0 zlWwfP0dwNLpEB>!HUN>Tcz#or?VMcAM)-AJcgMQHwDoOTBohqN6dZ-)PTD?cwfhN0 zN)ds_0UT%{Nd*o&>G=^j2PZWhni<^Bz&af=^_ZPirgNikc@ZueX?I+C`euMK3F{b~ zha;gs({Khpg;Pqt#IhrZjZ@GNY5kE{{}>!o?J)E|UcP6CIoHcxQl7Vz>H2YA>D7T2 zAAl=u1CE=9V=A$a`1w60v{;PVp{jn|x~yXAu{tVT+zpW&QtwE3kG=^%gx6I%>1>?e 
zlJw^cOdCk&K&K%3fzvM5B)@~c{!?&Kcc+1ITx9YA4W&?L;E42p%FpjvWvIrCuxDVP z?L9MrCtd%i;fSiXnqGNVMi~I8gRoRr#7o&+AOkk`s3GYMT;3t_#LwZ6IP?&P~Zbby4*Ehy}s=yYEGSj zW3o0S)HfD`>9N;6dzsvV$fN^F;Psjq>}Q1ZRM%Tzh`9|z@f+K^{^=rZF1mA$>n<2p zwX8m~gK+>xm0}(-Zj12x6@%>@$e+z>{k8NT37BE52zDM%_LO#5m%%(Ll6Fd4Y-&ln zUoB#%^`!$biksRrHV(j1#Xgzcyu*Qy=m1S=|4zazRFeK2mqE*I|HBQ$VM3ndkUM~J zI4BbOiiHc6?JygW4V`lba1f5Ea9*|W8N=I(wTuc;WDe?etv}s;!}|A&aM|eE-4{uF z`Y|$*&L5Ks_=cCZgYRp=L7g{3|L1jg@GDa|5G+M#%q=kz#gKeF@hWKBx*a zZopID;UN`$%0{*@u#0QW?g0}Dm?pvjOl$u#!-nlE28di{Tlx5Tbt={B>O4DQKtRhr|*I6jcOz$BR!F?35JZLcGz8YQEWL{Q$6zrgSp<8n}ePE;mxa3yCwyy z8scnb@*CKNEMW=Mt8^yx`wjIhv?PnFQH4ZOu_iY%3h#15iLRQL%q?`O`1Q26pr7#E7H^!{A1$AK| z>9|NK zPpZu*hi=34C)VF9#{PXdmR$OiN^-Rttx0vQj}}wk1AB`o$fI2Fc-S8g;Ibk{LC-u< zS(%Ly?cYe~|D=irQFQ6pS&lNyh!!=fdPKEKb5z|8W%C|=y=}=ndD540bpworC76)% z^=0aM+71s!I>M%(ko_z}Z$wiYqXxBFQuQVFGIXh`V(KVV^UQKhJ$-*{Ku6y+Fk_T^ zVkura^~+au3^P0NJ)Ng8=}}-Wqc6kk6Z)5ufe&`ImabXiaz)lGW`0WYfM^@~_V#g? z+5jTR_54WDM2Fufn-0fe9GD50)$l{0(?4S&}j<^bj4^4k9E1Sj*iYG2Eevy>u>b2`JA`r>V zvh8-+aBFM2f#y=gTpc;-%oXw!wW86>Y~@|!LWv4RRyLI2vXzVC2H7{eH(pf49NW^& zuCLh^yF2vpl28{@l1>pQx@70CBSFTIu0S4_<&Ncv{2p2=p6HU^WuqRK>6FzsWREwO{t+9i}{?&Kp`+k?^Y)vc_rK+$UM(lHBmdoNU4VpiJs1Ds1MGD|XVzG}GDSZ&*e*j=cbXRB|^?%5rwYd$DULfJ-hI`Kvkb#!!V3pW;- z11zsFu-ed|ul;MPwLMcMAoi@J`+whZEM0&3p?4YBsji4aDy^bjG7WbZ~j|f`PG59i#oE%X9{pSF&rqc_fPSBlw@(XCp<8XDCv+S7sk>Lxo$q)MZP;HJS(-oTGw zWJEH#D2i36o(;dq}y4@7iMae{r-MO)y?26=k0anv_Zi-i+V-_2Wz5Om2sgcx% z*9q^@w}eG#c}FKEe8WJ{8XGC~wn&^;#4NDuh6$zN%T*Z&6xC+rde^)_9byRdm6rH zMl)PrUoW3Jc{^?OqbyphRBjY1p2_YP^=BPz#;S4dME1998L23Sbks(g*1MrU%i7P; zV#UF)Uu9Fxw)&d7Z6Q&v+tqMjwkrPJU4PR_&)3v5)>9cNeDJ!CP*s+wExlX|8_+8P z{D%RLbS!3lkuy%6pq-X_YCAe-ab*d=mlb8v6$!<9oZof*a>ZO<+&bJsCGBeJmAfI# zFw2=4s&4VhGEfW9vXjm)3F%|f`ZmJ`cg-ZXj09pWNZVHpqoN@P=83evxvrADXJqJ2 znWbK>Ee<%YC0*6y())L!*SRMVmKCO#6zi323p*NWdz;$0<1n;~t1TB5)@vwTZP|v^ zXi5dzMf>Yn8_XX>4BvrX))a&Ig5A(D8x(i$B-nNR{a2|6v#-;*ZU~QMbLAe~v^CQ9 z2p$FW@|t?`maTT|a?ey3uS$(pgv+r(dLAG%^&(?W+d}@RNv{eQW-YBhZOH2)Bw)$D 
zB`own#hqof^7hg9z1A@-DY06&eTz`OE$h3XKusfkZWe!eU52fr+F?Vs^*y{#>+fmG z@P+=UttE`uRWmNUeNAw@ht;IvE24vVClQn_@9I~fT!W2oMDpHR7m40HO zmw{B^P2d{!VL>GFE3(0ne`(2L`pvzgHqwQpKP&JGp22Hak_dVh0ru_Rdzu4z6Z-Q` zr045OPmWk0L06mCG_Sm>fMG)!U&AZyf8V0Ut_4N|*{v0kv@a+9S+};pmW4}Qvv_U$ z>b6wo9jvzuq-Nb3e%MlhF3EMhg%?2@-V;vMiVRq0)(4qxZ|Ux-m3?ClVpZ?6AmR7T zVs^|hmgPNO!jlB8v<`3Kna*P@zHjcnU3~v8Ea`n2h)?R7<{HYye=1h7EGzn1a$KY} z>GNArCB-Gw`;oEBWm4I9u-+7%z#H`llq`&itU=rc27={yndH|MQpj8LJ0AK*BDJS9 z`=ZYOYOMbiJXKqIsknui{mVc?U5NlH@4`#CqqfY$Hmiv|h__=bi23D3AWAGr04>wF z(slZ&Z{va(ktyPY2cP{G44BQ z^&&ihCxV}-6)jdoV10$#PemHO80&uy58#4Ib8&y`SnQGhuqo2$C+Yn=@CxqQ0B5LM z#4$r&`?U~RHWc{1a6VFJ*qGhLy(40v*zcu4WtLq2PvDh$gZvW_NO+HzBL8>|{ef_H zUJ5^_A_wFdF%$%^iuC?P$bUf~F-KM1i(5xnkBA1_cHlSUzXH$UE*!BgC~sx=G-^-m zOB?I&!y9-258$BgXd$qHg4hs5OWx(3*&4YPppykI%MvQ*IJzp4LE;f_X_q7kbj z5Z=&;q{#0CinSsC2mod^vJ%CJNL7^NIu->ARgix=Plr|44UduQP8V6sHNF3vIRC36 zq&p$z=Dg?tt9B<}7xVbN#5&p&Ov~fcRZ|Rku?s~cmBL}K4axtAAuxCw`oQ*YO~3a% zCVA8PZ^BXK^SzU}Qp^RW0ziWHc>}$k*tkxshq_|bbWeUIDJ*HfUdQ_1!((`&&eRdR zLG_9bAoS;fBZ{plrhWzUhLY`;3oU0|`tu;Izl@+dQYm8G+G)}ohdf&%o&E!UfxDqV z9W@5bSU*(BPESO@DR#Ln(&QI@&>BV&R<=Kn&W*Bovu}ww?P0Pl5g<=Ufjc(FTa`;_ zYI^anq4l?my#yl)VoPQ~&}ILL;7qOtX?c?Gh?>)4Vy|5E<_OK;F8mC?!lR_89rZ>{ z*c4b8-P|xOR^So*2LFZ|VZhr;Lyqe!>cyxf?i9=LOvHtM#ro4Lr2fK59RSaH#pzuW zs@AXYJ3NUrcwIf#(<%TA%wtFF))MXOPjEXNSX*`BQN!N}jgP!_Tk~IO|No5jr*~)= zf@E%~=Qw)k?sP}Qr5p0^s?abY{3W*-eY;4v;FY)!Z@T=Wu39hVn#*>N>IzZnj`aUI z_6v34eT=DhpyOF~+2AFu|5tbz@~;VmbkeGW-bIogUKJeAug*HyQHf(zeb04s;I(4l z{6gHJ1^w@9MqpfH)R@r>Oo|ll-++I^uR(j=)1d6Q-I~1;6pbcLcrN|95e~GioXezo zVWN7GTO^wF_XS)P?e>e%pRQUfCS}6P7D`7txh>xN-{99kGzHjYd zr_|Pf$E56UZqGy#1YO^5!SAx4gRqmMFlEi`CG+8Bi(X%u{y%_DIgTiLhlWM4Re8Xa zHh3!9(5pU&JTS)Q5#1oXaI>x?`64`o-zB|Bdu=2?hVlnlv6m&h$G_p9a3dxeJC-LX z%h0UrU}tiCi$yT17ry?sY6vl~R#C3Ir+-O-=XLmLEA3~~Qv7A>JBwUKx_-Qadt!dQ z6$Ydu`PHlw)AtZv+mihM3BNh*=fG+>HH|z_Ou#1V^F*|%zc}eV26tbV4iDtR!g{R= z59?3(#c5;1#MG>raUk7ZH|&TPA|3x@h_q{3F;XeF4wEnmz)y@0+m1m&9^|@$+{~Xevrlb}9kv`AbH$c%F@GJc4r0YZaGiIsp 
zD7_4X>URr%grA&rKJ>pVN8znjag|t={Qs%wqac5@|K7|S=AbD|)$1a?zdxiuJ%zg? z#r0>(MXthIxC_4yIiRkczp6E&#mYNM^F7;1{%vhxLjerdje;_t?-V@@>rczvWnQc) zXA)3Y|D^vn)A~08MbDdh$dOs#_ej?N_aVxAysB?af06fI)wS^f{2|ig1%G~^Qna^x z_l!n5-F%Yv{8QoI=4VYsooLU9CXu!g&L-b`bt@WZTi`!UyR&t~gr1U|k9B&J_AmMK zeLdW=B}Bb&NpP-d_kM)yL(IFYmhO#eT2HPk1-|!+1POB5nU=EARTMQsfc44V1!I(bW7S_#OVGyIW!XT`K}tv~7{k zZo_N11HUVFxiy67JSf&7TXwj6Dl$FQ-SbDdx>f$`I!4(R)3>a&#qaW-1@D87xW9k2 z>a0ospPKyc4CznkzqfPs?BbNXM_T{C;o8>qhtBn-|4nVmn$m=Sz&{oHZKFRC#ziYb zTP4PD{XYo}^WD(;*Q^x;OPh3+HhT^?;V0?;yP@^BiUZ7ke$x3%>;F&qc`KS{*Cr36 zSX#1+%X@eZ*Wll}eir19-anf6ykfMfr2lR4-d`QEKkLR=A3g9b*`=@_R~0>MBmZ?9 zqVfRRWA|I|GyJl3{R90})Jme1-jVly0e577Hv{0vH}FiC#Fv64nAT{|vXZRV;U+xU z7EARtT!nRbp|SqBj;yIr_)w7CZ?-gcA~B zRhg3~a1(B*095!4Em)NP-4Z3iHu^JAiRL^U6|6){Rk}xT3vTU1n!YH2&eW4|M#va_ z*`VZruEYJUSl|d;uPsrZG+2VTzTJi!aAz0!uPB*(SksZ{P(6n0aD7`Wb6^2(>ZhK; zQ30YNp*)1^!t>uo|C{gt`bv_VFzn1IJ=}q7aBC=fNS|lYpJ#B)2BQaB|0B2w*WuB& z>tBZ3LOwnx;N9_J`r@Yae;fTt`rlKP@T54cP@C6lxF;;-ZPx!v5ktCZ928-8*pJx1 zN89Spd&&PDTo9yUR9{n_>jwM@ced4^mfT9}(K&CJwoN&}dvFtOyZtN;ea!?M5_0X7 z!PmWnXYec)4%qoK3bSxnq~9qYw+ip!6}*h*p`AV%9DpO{fI2Qn1AbjZtk$!b{B#yUBkH z4#Gi^-j6Buo)-VyI%T&1{ohUg0rPQK;3y~p zkLP#O|6J$iFdWePca?ol@?YNN`AN~4&8nU?Pz3r?wa4x*&yT`kLncWXnTX1s?MaGO z*7^@hf2vkEPv>`Wm**#7&KTiFWK!RY7HE(4hy3RSPBD-JcnMF{?6u=36U@xKz^4b| z19>kI?zaAE8LwHxRchK1*kk=8Mf+P-E6I{N-Jb8h|0Th^9Mi0Pt{cE>S&{cc4{Y-t zBO0`f*3VV-Bkt=Hv$ZK1-O@#PeV@;#nrt-Vb(Bb2+1+}?9*zlIqM{VQax`u3=To;# zXp|H_VEtvavm^u1{LuFwGeq02C8B!!e7<3`tpi<0*Z2SX6(!?ix@B$u@~5aj15I6g z!20X<3MFm++Wu)>hnhJ%mP_3GQ_{u>J!`HF-eHPQ7Cl7Aq5 z)N})hM)!SvMhy-b*{}~gzoS-93=yvGXZ>z}V*P7M6YW?2V`hKaQo*%-J|E{do^LA0 zytfG{oAJvmk@f-d4{)5SJ@$E=@Xzwift z0>7g3anF~{z17|BsPhtp8(FFeVhbAMyL8jp!)nON}Kh?a4SD>z**mwp=NO zG@P-wlO0pDTSL7z>!PPv-p}vXh3Yq|=Q|4W7xxonmDMsgVE{18nx@@*{iA~Zhw0)!L1tOaf|JHgv@DXI)jgjd z)$`-lV)p_2pTo4lpY?4tbzwjIpXvNeD#g^4^6X3hN37X6y6l$s!&o2dA2*!cl5QAj z3-`Wcw}A*07>st_SdPr)AXOkf2{@ z$fgkTqT3^xnQ-WpA+dTrim9_ZDc@;@&0;#|N2Z!`*n-YUoXY>4b}C;gchgk{BwY>y=W?Jr~%(J34k8gZQdbp9U0)BWU_ 
zPpe2gVl}GNWbkMg(s4v5aZG^XQ=(mdr#^!{^yi?Uzh_hp?S<#yO^;_aYr080QRVy-=DDW?Wy_Wq2{@^cOx}845#6^3LNPYxvnOQyYP4y`IiL!pT6P% zRLJ>XB*2Go6K=shc)qO;O~QHj57K7-2|8Eq&6Skvjh zqbmD0HY9^%@CkgXE~N>>hiR%&&;Jg8ZcDn{fD>?8 zoUik8fE~4dq$>YkaAT+S&%*`yL|rDcVkAh~{{Ze=m&!IeG!19qvPjcsHTO2KHo>cK zecMK@x=5!#6*qIukfPFj+F$So+}*azpe*@+ru9##?_^0n;E@`Io^8ASIk*U)!Z{i6 znl(!%8*&3~?X>=r`u(#ia_4fu@6_dReWxyyG8~1=@QGL(CspHUS(n6LaDCe@v$XyT z@EKeXdC$0toQu|FaSI-9b)$8WmVaiP*OC8PQk(IgaAVspxuid5;SzizK9!no0E^oH zq`!~1J-<`ZpL447PN`RAQPZ%0X#a-gKMJSdQ@AW{;1PokdaV|x)Ft+M+w~uYb8sF` z$pnn4%#wcpuD;^oPUrU`d?C{LN#)4fYV$p&{ols^RNy#VhR;R1o*vFMpZUVS zWjFz!!WG${18PxQ*6*h-p(MZFw(CDD{h7D9y}nwH?!a}px^1IYI=@%+`v>d>-PVNF zpYYo*_U8y(f-m5*rtVARoYgJxl!B9K_zfNpo#8P!1K%3)VGKN{&gnHZ4yIFm-}kTr zC*T|S9xjNsG`g+Rl+!_lr31Ou!sm!c&#$P#vToVq4lK&8rm*h8&>l>|MfhICce93} zT+taktV{hM#*AJ1U&FU>$*8}g$g`@(&oKb5ZH?pS;S=~0t{C-D9N@!>EMo~zhMu1k zao;y$evOHrbOsIyV?MoCVbCS*{%7zFe4?52Txr-xWh zzr$x@^Q!6+l@~7#v98v4r0jR^Ztv=u|d&1M>|*k>9Dmv zrr#g7pR=Ov{!*|z4Ko2v-Hc|17<6~*`oGfp=jA#xgLsO*sSK_T=}#v8`9}6nfGhb% z8hJ!SSIIVaL>s%Qz$f8LDew{PTcF%j^)h8^TG))E4#GwF5u8P!PoE&oRRd(<^XzzeKt_j!RD+?x_*2i-tt+sI8;>1>lZtQ(g4k)c>WMvfxp8S zBA$yeyBIbdv97%q+;9UhtJC+n-9Iaac3FXOn1yL|PB-0#HQ*$CA>Q|+svdL&q);)q z--cRW?)vYahD+kYoGKD$4s@5V$tpMDx(6Y1@G1Nqz8C3yL$13bk6RWJNlVb23q$h% zMh9rlwt8KO#D>Z%BRUd~UH+HhJNOEY%XCx>ZfR19-G(wsO%K8k!YA-I_#2#(mmU}| z#HgXcx79JUnTr{L(<0rzEbmvfk=?RtQwId5v&YFjOYqw#RJIX zy658$iFWx5h2K$tDeL@>Sw?hyNPp^ZTG87DQEyds2`{Ur-xaP)%BwsK>vvS7?VpH> zu5J)P1D&OYu_3Oz{HNh8dznHj@8jOvhuxFMAGfUjy83oD&(9_4-zlR`46`+2tzUthQ&|5g>CXk-9dh~5t_|@A z0znJz`}#jI+T##j*uPx)nHb%?=IcKw+U2hlJ(e~`(ejN!i)4llYV_!hNXJ`Y04~Ef z7CyDDc_u(>CUs4HDYVEn#Cdw&0^h8NvNKR+H0lZOh1{^=eNMn9ioGWct0Y<|G9z@1 zv4wZxL{swT6UF{M>K=_KDH=%E{HDs9i)sBQ;dA&LPAl^ilg0xna200ZyiDIh7~ovA zt)IhLWxuQHA1Y~ZbVOdMZRn^iSAehK6AKgjniMUm@e4(8P2qJdCha>87Zo^88aNJy z+tWxSv(Qw-{Ns@Sarj)PZ%T?9r>CZW9}!gI8v#si1n~#pf-c{OtUGpKa6=6N5#;Ke zPh~a6boxKD%wyROV7TG*Qmg9uUy1b}Rrq`grscXKfa%MDA5c2#9lQ&;p*FIJpE&O8%d~abaBz`z5s(X~W)O#x`C3T*bIg#9^Tw7+W_@rHh&cKH)>k 
zx(5uDQr?jN0r(vL0aswo(D^BL@KL_8#H*mu4E*%5i&5O*lT}{6&pybUD zaLJ~#yIX9~z#8ioHFVWQ$*1q&GR(=vwk&!n**^n?l?#EmJqni;J{^&j$tA@QVP9`i zRt)tb0bU-5@8EmgU8_dYQ?X9Ko}uP1!^_a0S@;~jhl?;P{VN+%84CPbazm5XJN`Rzhz@zD=7=uS4uk#Eq0Ul`2@a$PYiH0 zHv<@L+|gLM=Jsa<&cSzZ1rBRSBqk)92KyT>HIoT=8|(ibE^ib*H-nzA{^8HFB4q!D z{(lDF!ez-nFTVW1j!;#I?XP0!wWj3x=URW&PEp@*!!pA{i6wAC`g30PCo3M_zOI6i zgjjX`nNj%vh3@EVrq?nt3&{rMnx;J*q{Aa{Se*}JW@~D}%Vw-z1NpI&FbCxv4!{YK zi03+bWkY?dtFdt$=HLvR(~kj6sB>Xf`Qe^Xb(GC|)J3>?LbLp&1AIu>24k`^L)O2f zmmgK9NP50fh65Tct_iE8XBt;E1h29f>(k#)F*##!K!J1FG^S^ml~^Jv6Jc`+5=mCP}fOPKV9)k1M$xrmt(b zQ{AE>)nP`RQd6P-v+6t=F`k{=IGoxNN+0Slt4^6p1rDffZQAAwD+YGh6@XL^axsn{ z2>lt?Xl%re6%l0A%6Qmq5~!MF$JvdE+AlspvrGo|U|DMb(6O~R&pGmY=p*-0lI9+g6j%JF16 zy4ba=BQ&xcCe&wFud_O(0&nE^GJ7(Duw1K{Q9+{?dop38eLXw$6_aFHK}4)~WBm;k zHzrJ1BHp7Y$;?Q@C!kiRGc8z?zK(d!JVul=(JbnD@PW6u;Rvqt9;cvpsT zm?~Dy;-?~)RJL528>|X+s)iL^HQMR;iZCfCG9J-S)@pRdRGqKe8+r`T*CjmM;CRfT z=BkWD^nmG%CsIQxyDF%rBh(xdyg(E#b9G<#bs{2*n<zO1o zMHzW+Fym2q&&=-hi{vjJ)8`n(X>N~2_B{-ExtNdlbV9GL$)mDvi)B-{j(t~n0^&MX zDKajXEbok&&azaxV^LBb4rxZ=U&Wq`TVPi7=>t31qR8A^7~E8LpSdzN* zrBamvtVz6GDQV!u?&PePcq zveMmUvt9#pV}mV0WYvKrDvBn`T4BX%uYDtKD9b{&jnksc(0&&4AEjbQf^2AAF-Nt+ z`j~`~Tdd(%sgkZW=>10m7(6$c1PtZ5WDc;aoPZyVD2eszstyOXNx7)@HH)4w6OyM4 z@T#)w004jhNkl6+CkdsPEwsOtQ5 zbX3rx>=dP&4q{$5dY@h~P%NAB_H?sGZ6V2o646UdFPvEb(IwxIM3{m2vM{oRdP%0` zwk;IQC7xW@$#|c_C8w<89tpsRfWj(xZhP}Hgi-kHYe8;hkyOy(9Q><4q*O6<%eY>1*=zC`EJwwWyna2wS zt=bVNqcyj-kQJ89dqqrM+Pd?8_$zWFb#?1>%pdeO!&u<)F8PQnRU4&K4^$#*rUusf{vEl zM&*Jv1DNky($}Sk1l-YZNt7w#;%f*hQo|(Z-q>hf@KPqNxLoOYoVD3t(dmX`aEGdN@UZyQ{y0|z- zJkVB2b3^~vgx5bJ5k}lIFq;>~qiy5_n_t#7=McB%)X{(pT!Mb3PM%|*8xB@pU)3cn~(3huPDr|S`bw(E+>7uGCS>- z`f1)8Y4uPY$<0Vx$OP|W{mbenYMPhnTLrUaP?wpV;kC5Jb)DX>tuWVCtqEtcZPxR( zc($7@A$=^#hDWvz=}=Sl9K&_lkbhV2v!E4vhAuZV5^0_lwSX+g0bbO9#AT~$!J=ED zu8me#*eJOYDa;9IgtCR%{j5R~#;;T*+%H=-gGug6ugDP)VKZ~%)s&ubA` z*${+`!REJz&20zFMZ(zV7K6x&QAYOU8(KPl>nId_r?%RmF)C7pt{**}`_P7TR^yVdnC8=h$C2!8ub z^zp933%D;H>-AzCC9*qZyT~s|X1$JJ5?+fmzG1c^5)omKQWNlF@55abPD*Mk>=%QT 
zo*m$l4$un+6R605C?2;}o$4hKCoCgvcnMF1#oo93Qb(r0DKow<@|p+m8ah2Sl)W+Dx1lv~yoS4?5m_!yzzp3+N`h5*>T>`ql3yClXj^X6JAz7*;<9?Um*e@D0ts4D ztQ5^_v5RF{&t>6qEhPP4k~bfy9@3E098vWm{6SZ^O0Qy&^Sy`!T9Q!9*aJ(&n{8Xj zF3&>#VSirhQWny}o=5xAQ4jQc==Ed|-a*UK@mM7`5`G;i>mzs>gPiZ-Av`qs$4x1x z%>QP0t`$+n{|bLN>33EH0mP#Is>SqO(U$%SKgTfdKs*o8NmeRS-bGW$Iy@Kg;Wg*{ zt*NnSrq~PM>E-kmOCp~9D~KO^s_oARK`;u}ebd{vfXyDl@9AsR3mu{zstGUQD*Wi2(Wx;jO8Fzj`|iLzak_gx5b5_L*Z(ms zK7BlV_9Sy!!k;1@Y{u_TsS&Pb7x~Q68S5hBx;C`_G@~;CGd9E8u^LLb82jIWpG6zp zM=o+y2#}-2s;S%Xwzxz8@bz~k|0(qXc^%lTfAV%W;Yau@HL?5i6V`?uWx&4qqi%8k zA4L`^7wP$^Ws!#Ju$jEwW7(fO5yOm$XJ}H1i!i-jvFpJ*xDNk1pV1zdw)oHn*; zRx+k&s@B?)Vy{)vw*Cyi!=o6Z?iFot#qw8C?2XLNU0Kgt4h0b^P&W**Fr5R{Eqwp3 z1r$kBl8pVUSmVJ!`=9jZCj11yMf8HT9>y_`)#;^+j_wXu z#asR?Jsg$B3H8!={wiKR1+F)wKTjf4*OL!wD1hqg@OUXa4WHeGpW!FC>rf2dTsWzg z!hZ3ewwRCqkO6-lFFz^HXSSdG>QloncW@M;2Hc5|AxPY^goZrIP+L>1MF)6{|@{JKf?W3e>51>lt7O@ z1mFL4TmO$iy6*O~s+Gsf`?tj1{VMznS400>w*C<-*G5Sc9iGtNUtRt^Wgg>s>pdy+ zf0N^RCerg?wvrxqrE%V&;Hmdj0eb!k|0(y~4;5dxYuiBL(k#M8HRdH;gMY%m;r0+H zYs~z3+lJ@;*WoSP)9!Dx{z!dB$uSZkUBL&00snmnP;p%^A5nAot**}fJ(14;4)=%F zzh*bzta$AnD3kw2r15F}FSf2fM9|mkc)jjDJc2*rU+|jgiqlb{7VLKvt|=M ztYp6&zrSXjwZDjTer-tpG4>S6zrJ-chxPpl{^g|KVS{5vC+f{H2-gy#@EytjH+by& zvtI0-cg=07NcU9U_up`92-G@?$QAXMM>|$DUo9$j{!i`yu>OP_@G@d_{=FA(eCYqC z{;Vix*faal*ST6%+U_p=2>%@N{vrRU_g4+Yr>P>-8|lxF@aInUC#tq_ed_7uQ|9xU z1z-jH)3%gH6cBorvwH@2;ZL34#i8{_$vLyTSBrcDb#4J3i1hs~ym0BK&jQ@im%R}n zXG1x-HNp9&%ge(dNc2^>r5OFW5H0FM-`s=2(jyx$|Z1y{F) zgYF9%FO_JH3Ab)skX&!!K3s=uaC=)g@wEPyQmiLzFfj@E4P1v`;ksX+;{SfEe@S`D zBf=zU%YZ+DtMCWh+!oIInI#F&8lWXQK(FAg$$xnLi*U<~_KN1}N|rQ#C@pv}#7xwJ z`_NZI#R-iTR#hc^AvpHyaDQ9hAX&Tn!X0|1806!2~?j?~kc( zcU3B!-hWtsI@NW-7( zzWy;d4(Ei39fjwY>Xb|R*V*>riZBh1!(o*a`Z~amMaZz{t$UlQQ5$eou{CJ5UtZ!>+r>!3n-pE0B4t0{tBQr}Z=&*$r!^dGhA!>VXWntMK<8|iw($}da%q+}ZlmUW?Qr@i~FR+oAyCh32$kil$H%4=T2z$qh;8O z>w7+5G8KqoOk0VBy+t^#YUWsR{R{il-$0%kQ9Y}vR=eeWQu@s(0oUeeNv)MNq@>RfD8NN^Tdj2lfuqi z)%jW4&-pE>AU>t@(-NQLzK-CiTz_4MKV`9ER%mbePZ}~v|3mesq(Sp=;BEc*@_x?m 
zn5w}elK+~{@5(N&Cq7;Ys#4mhYw!KuTR-I@ukzled+3u}De!}TzhvFG-H-V3Sb`5_ zeDLqb9K?q`?DE5KwLe${RRsYb{Ji&{v_EeBd9nZKV?HrU)Bn&V_~60EC|`(xkJg{) zB<+6$sd~wZQ6IVfyqFCB;7_RV$2?yu9{!KfpJ)a6`1g-4yw*o7A$njx^8AMV>ik#q zFM5DK=J(6iknwTrFKzwvl3|%;R)kwX^rz9v9 z5AWAwE^586+D! zeWu`$7}Fwg*Mq<_;&(X0#I|BZM?cKl4}@ElZ;H5G{7DZ%`Dcl{p` zlIWcFKdDf9{&@^2@1j5P{?o=%7HR5NyJu-8bp8(L@I>dtYiaRrHh5I_=a6+swAJJO zJUA+LU;n&nBvGVIZQf7!B)uOKm~kqTHB?}Er)lO_ySsrAm5Gk3MmbP{iCT!%+H>A)nMfltKy?hYi?$*#j+aBrtN z^9Y=P3vdBWi#V;PE`?X{5bnS&c)Hb$l;AL&7wP)EiX?sOar+DIz>}T!U>44bbM=JC zBLZ0BCESLa@L;DiI0DDvf~xG3#{1RNh}?Cpe;ZeP2+qL;I4SRw%YZDvW4H~s;qgv7 zFeBo}OK?i0?S6Wm!tHGt%U%ATh;(~eL0H=VrxO0{ZAt4(BAxzJ*y(xTnW{*93vTYB zKhtm$&cRvH`^MNow0ZuyQ>K( z;*^xHxzqZS{Ljb$MUnUw+=shx2OjLS{sx?o{Wzq#ujE6L{-hSEdt19~lK#v~|Ido^ zw=YbecT(Ql+aCT{|2g;^F2NB+FEO8XPy6?1r}fw2gwD@N$*-(Nr_?5Y74Gk(f2qUe z44hD%ysnP1_wWpE!8LfWi~gU6%kY`(Url{$&vpLywm+BPGMq8`jJCSuQkT#@*Z(}E ze;mGq@8DArpXK@+nMN$;)w_3ls|}ili=xfE2(y-B?kb3w(~-WtHI8qlvPbPv8fmEsYkX6}4)|sk*gQhekx) zmfru6?`nz z_|2!}150WXJ)%*{Yg=tdL-N0(0A|8a|9Wx^seSnmcr+ycWAG__A=>J3o&I&@A5N?F z?a!@ko`{a`ThaDT8{k7l>p!8$_QqBlQW5X>_iz~w+Y-8Jbb6`|>hjR~fqAjEUV+nk zKJqt*BrtTC~emRr{+t|EI*0bZw|l z9?=ee3!lPCm{RgI=8T@J_c7brCX6U?{|0{(bzjW;rbeqnx|xi^t*xDsH8>4l!dD_s z8Z(@V7QE86ehh9n;d$7f%kYIr?+;1-U2Va8IHo_`{`_7Pb>by4kE0Qk<7&O!+x~n5 zpTbdHA393Gohq(tBL>}Y0EgiU{15mRj*B`e4sddUV?lBPZMfrmFb3z~@9=l{MAT0) zC!TCz3_+G;*f;M|WjUZ9;1lanY#IdRtnRhBN(#?idY_0j@szAgW}_?PhQ5Lr-!~7? 
zD4d0F;Rg}N)n$OYFbW-Yy^gB!^@is{$|7$2@8ZpF7-C0Baqy&#a&~ipe*ZOm(uh^#-8p}cijf(a1y?U|9~sfo;W}Q z0jZ8!Owo25d{C@&e}j{X-3F#VlWH9rmp#1YK6qTLk>82-b=1lrX$f9ZYN~Z=%Fnj@jhc86?9HaVa>bEZ}fobjEU6=o;SVzAXab->W zS5+i5Ei8)2i*Bwzg}l~NuToW}?NcRUxE zqPK75Jr0S#C+eJab-#>C9ow!yaen^+pNI;sYQ0LCCPzlCu(!GXIk*IW6Kmy^N;fe# z*;7bYQbS+G*!8ULt{ow*WisYtvfgd|{wvr2&lJ8MHrmjdQ?}7P?+(C%mCWIjZ0hwj>8rB4ld~V zs_8!pBPt3YQrG3C{>;Kf#hyo`{RRuV}<< zLe;y>cCn<4c63cY5B;BqPn5fuwf2FMMS{5zN35ykdt?1Ga27s?3yQwO>Ml%d>$;uTm(6&)fIt*Vc_J{$wjwN3*yU-+MS`+DfKY7Q?^8E+w#AJr?)Rj zA#C@O{LjOe@VUHy#lXTcwaN7jdZ;ag@O5;3`3Am*lNM$*1(#Dd4B%X&L+@h!C*XVd zL3gKy5ta;e56ETxYC1r#;BAusI9!D9;amNFrKqi${b$x|BCJ=<}*{G1Khyo3KBZlsr9Z zSXCv{zfAF9LznwxKbv7cQviHg(OcZTBM;G)4=ih-{|&5W6_|t5%1`C$u#PE-p22Td z6$GBvAU4=^>a;aYbtSQijAl(UTu0y(93LXxj<$&?oP`f*8j?>PW>hAhhEmzqAF4Mn zpq`$cffUD_g7dm&gQkOz%lgJ~iw@(s@TR=oILs?e1)6R>t-SwGW0k9MU7(8bd?}Uf zPbg_y+X67(vn@y+e)AgT8;$kPYc4WZ(r`^El&KQYQmb@B&z}s^`3B4diMgG%tY|~Z znxQ%XC*d^ALRN-@a8`NmvYC#k;I-}ar8b*WzWxc5N5$Ozpa@i)ZZP9;1kQ@|d_(?+ zbo4Xp*lQZzeqU&>J&Eun%tE;i^D11<7TsmlU@LQ-{uww0=Ujg#;D~tWN9Cew)&tx( z{LZpS?~l3ur}C|bzj5-qM(NgszOLfO(p@Q~hmPsLM{G7GQe|D!#gt1uB~!nlKSwNW zlogXNC;-aDczzttsu;0x9*)6T6<#ZXY^z!56Nf5Ivz}3`oTu}9($c2AV(g#WZMr5X zg44eKag%>T3+@y_>ti8Rn34QvLjDKgn7n^(7?%-v*K~NtMLM6bNJ?eN^SCm9W$9em zlqZ(~Pidz@Hsa}l1DyD_G#=z#`ShAma z+S56emOSL_Sr(wDWmGhZm`2Gqfr_JT1&cH@3;+5uh9fX1hw&78qap#1yMtdmdd7B; z%ZVL`Hj5g~DK)KPtbtL0=nF&)o$3jZ)Q`bJNv)UBwox)t%C;q{OOis$Ma)WMy%AwO z*X1iRQ@RLz&=S($Px<6pgXRGtlT?Z!M@IrfAl?HR>}fR#E@Ut%V1`jSu(XQY;{Coh zqM?*m^oee+Kl)C4=C?eHhnc~&{(7CYX>0oJ334pcC4ZpbY-n|9|I;hK%dB-V2K)QP zOUCHY6|?U=c{jx@h-y4=?&^?@Y+8tXGs-sqcL5q;r6 zesf^9bwuZF1OXM#$a~~PQRi|Ref@r%zgdAv4U$cpRmW|sNV3#7p_JqsCsa_a)#)B6 z8pN)Cr(!7cJtJM4F<~vG^E+qy5R+*=+1;{X7AH4yKy6CpWPc8*x}7qA_2LbpWYx>` zFH^#1uhnRd8&Gr2N*pDNCrbt+mC1yTz+zdg>vIN9)iu<|foViI#tB^%HUgkoWuB@^ zpOlMYk0VrpF_@Rbe3ZuxIecWUpYBR^2bNALS$eq|ap8z19mDa&m>T971_CyYg6eu4 z;D*LNym_c^#kRh*Vnq4eN`>aAR5yh@jUqu`$$ZE_D<4-iWP?m&R1{~GBHMA>AQWqI z3?+?YC!5`?=)%@kS)*=KCKz2zyqR$kUkAMyth?3UpPQ 
ztK6DAi<2=M%pkg|B=$q6iPJ^Jmi!NSQzk% z8IXakJl?COJ??sLG15kHT)M?bAnFO}sFd>}xOQd^pl@~~BB-*?&_r5)Rn^szV%nl) zv_oCng1&sfj56fwHSI^kY;Ej$&nSQf79nj+s#RGZkHY}D1{T{kY1}mo;fO<1o_ixX zKUO45E!(X#vy57%{9kU9daYV<=5Sqy%5#~%|9#tnydsq1M)ArN!B7~Yijq$iH56>h ze_HxeHm4irGFSTBW%7l3hZ|db_D)aC4e zT1_TrQ~o2CdF`3P4=f|uQ&Mxx8Vb5K3&5GRWlrlVF!veTkUeYOPL0kKasbSgT z@EEKMBeStfx!$#7jyrZQQV1o};mzE5+$*Yrj>gjHo{lJ4aFu>0%Z!aI{_l#R_Qx$c z&QRtdO3C)OCo>+lwj(I`hElJ6vk)6vU6$W!nV~G1ctqsc|j~ zT?&b~RMHW0*M;Su2Kt+Q+l!1MG`@FTbTz#@Fdm8MuG+XOy8g#@^vr2S1e)t~cio?6 zmUFCFy1A!BZmwyPX7NOOk!RDa_w-D~97x%YLswCDL4U$r$LrJ-$~G+?i_OQ^AJJsp zRJg2n#!gOO#;#)#&c6!jwiRAR3@i!|WydH(D zMI?M;Q+ve&mdm}DkhQDZ)I!H9(77VEs-5D9dGoxhG3_M_qO#)o%Jl`+EoPt)sfNGn znkDH;V=+71RaUc$C0xQM^okXbis?|a0VW^R?Vv&1D!;doVGpdCs-*>x`$wyvv}3d5QyI9gSPf5Qj&FafBa)YSOkxtcSZYQ#BgQ7Tc^XGjP36>MfK}YgzZkeqrtCb8u zHN8*UULllks2iXo`EP7t->$i3^PXYDBL!78Be{`O8Wfiu8RIxVWeKHXK6q39ak~ls zgf*&$YIXGOZ4n}_%0q0ZYTLTy%7)`GFzXpFyIGRy8JYD!S?Y>8;LLP6vJr91*+iOg zqgyHRl@83740KUnar+b3tdj0)Wo=|uMDUN=O4`3}-_C|dc0-Y1{7WQJT0s}qF=eh< zy*D<$Z(c2=xLMxxTIW#pY+&9qB-%BT)y><= zG9r!KTxrutpF7B!tXSV>oM%r)hN#zsp1KsI?QdYMctoRrqN8h5Ny@k`lInE~Cu7~> zmA1kp^x9z((ig3%jkS%SMxsnzQTWr50!KSX&yW`vVL3>vmtf5jz2dbzP=|r*R$0v2kH5uddjA3co5ZX;yN9A z+Ojsz4VZ^F;%KO<(Qr*$kXx`ELp9Nc+QoVbTg9qz?0=X9R2E$bp8G)hShqWZyX)`- zWrdjUjpTe|!fr7!kjt~PC5CeI+7VCrVsi3LJsd7NTG(<``pU&HsHsluR$DkTamTCJ z;=3weMhwsu`t|#?{hpb+^&=>L1Qw9F~17O`ript01!RD3ul6SWi8Wi%Cpt>M2`- zam#aLCdEjAw$;P+JmK=Ji%a0Vum^gUWb<2KRD<6OIDR>u^7rr<9%+C$F0y4i_sD>B zMN0DA*Z&%xX!G32-nb*v?HT@H9L-2mPT zOpSz5M-w=&oV0x)D!->92?@>hbV!_tzToTM#vs+YdZO3VgibO4;whA#&fCjmGFm!J z?_kafP)+k*arg%sqIo5GZR-Cs%`Xo5%BX#ZMQEA>+mQcj^>o%OT=VcBi{W~(s3mNi z-$!sxb!i762on~%(A5OM%h3N-;dHHOg259faea-DHhNe$=ljGX|a9|~wz^p|_C? 
zeL2KDvLc+ES*ROPQ`Kre19RU?@E9H@G>ksHf;(!f9J4Y;ZV`MYdlbTc9@FISMa6Vj zObr7Ig-gXyW@RgLN+`rB^pNeJ9q^51>+WtM1AdCPo`&C^Fz1c znH$h&@K7RXNMSlEcGgwfXu=zKAVj3i^}m4of*{Idu7_Hf!T~J7Gk6xyzY~+lAr;sn zCGTy71I2z%x9ZO`c&hc6?E!Rp%liFfuphwVK*U-V8Ol?2Lqxop*+f`RCjA{efhS3S z2EzHdEhNL3V(RDzOWKPx{_sGwI~(g?5J=21DX({qV1gJ3Sg&BflKk(g4}Qk7-`!$Z z+tc~E7y6IbU|072Jmh>O?OW)9qdRimulGNDgyD{*U3V9N1WK@g?h0=)#gIOJV)@ zMHE*p#(lY6$a}^deK#>5mC?;-LY+;GV&N=C>65R$Ez%j=E{MA2t~;?sczxC1}IFF`xpvL5;9#fXe;6w$iy4(`IQ z@SC&Vg^Qce7<0cE^7QQH`w)JCAK`k)A6+&^ia@+QOYHaU`g8++66yH>bsW~!OB7%S zQQJXjmlvXK{tfPg1B;5wsAg5`)=ghGP~7+M0RGhT3%>qb?IdxF&x`4Rlze(4-ur6- zMbNe(r|9F3@_E$pR*iK1=b`my8t|Ahmb{)7x{?`6@FjnWw0u*4y1ICcsHrMKr%0qU zb@#X>+SsR||JWU&R@{QHo~sHqei!ZVs>{EjUZTi3l!{rHo+jMr`OV-U*TSy->%v;rc8Mv7aZy6WV)ZipYW6G|3GPu zvUTK?i;+yE=H9?{_@~pJxAgnt#giW@u&$!XJGcWsi?qER`q$U_9WRbSdsTE@q55AU zjei~b-&N=`rDjquVwYu6x;nrA4*OFM1BR8S^DP^g9PK^$8U8h7Jx~tM+XFmro04oP z@c9G&1%JR3uU@T747t@_u)mRf=R^2iq}!VcFtAfrwK}$UCv}C!`4jweNO~A)J5Vts zdUTA|lzI6R{u6!~B5iJ|4po&7M%tqzdz6wlx8R@fPsh{iTLLiV1}i$79WCL#|7qaQF@4p!v7euH1&`VeWnH&Nu~{UZ8kE6~0Q|Ayb-iLbwD&Eav=8d&nBuS~@? zJHM@v|C&@~=;Dholiui-{TuuYH-_{l)?ZWHiH(r9ZT)}2zqi_-mR=!>&>{S&fu+04 zHTa+KpK#0V&$?oKZ>Jl`0kt%GoviA zIYrg|o$SxQ!v2)w3sz+)0{EaO9Zord+wd#=2!Cyb^6RUpv!afUb;*52j_ana{$W^; z_^d=>U3Ran-(S@=@fY}?@Dtn~LgjAj`Hsy1hxJ?3`u`386aLsrn%!4dNYkG0*pEMj zyYMIcIOM+FhlKFD65FC^4pW}{rmX+Wb>XZRPlGEB5(wTSV^_JL~Ie+d)VFIoyM5@C*C|w}udPWBn~_e+cD$0gvE@?oL;? zLP7SV`Rmf6mQ>)qm?R#^dj1Uex1xzIYYW=azoztWL0HJ?{kLELvL24bx8%K2k>)o1 zsn~s+^{=X(D~?dB$YgOBewX!sxRq66Rqxy~U5u`W0(OrB=7y7Lr9QVCvnIAY0E1w zDf;Vf$`rPiXHruYozg*@qU-(z8;G7`PojKB$;3ePQh^%^>YD-K82fb6Q1mZ zhd-+IpN1na2V*dRC3po7;0D~;YJK8U7N)~FIH}prp2ELp@B|*fi(NQX4#Fuxf*;V$ zInyfA67KHAhcE^w;5?j#qhic%%e_9b^$)gveniio5_0jFiVn#K+!G%Ej_aQkB>YLu zR;KB!l=r_c-t3iaA0CBca6zChRo$z5N?zTB2RpqyP&1g2IpKpe1O%S)y!YYhPU|m; z^!u2o$LFA;%X3N{--m}gEnyUn!Z|nx#|1my(%t6~+=EBE5E4~j9!}XS)Reb=r^0R0 z|K?6#F(*og(}wI2h4Y81-fS=bnlL5K>F_n64+|=?JXH31`%fK?z-i+XNHyU_V_d%v zFL%P@&)~2~>rV6TEnBus(Yl;=bN2=ysr1Zta@r! 
zE_q4vOY-mS^cBheoEE1}YAb1}(<_Cs_g#d;dFtF4m8SIeD^lRM`(br!z@+eNCn2*w z!#6vLZb*#Na6l=&vf|jc`(eZ?s}6NQ3XFpIRO#P){o^pB_ire;TU4v;-s^AZ<9EpgM6kzms>#IxNMF&}<>3sbgcLHaR)l2} zUUSdiAJ_iRh>>eqO`l8q(Vr2RGh~3WN=I+it+|{2mjqKVEdw}E+s$iuzaRNeN<{ z`0}kfRaW+VzM-#}v-7*CPN{dhuRk-`_f(B;DTqzK-`f4hOPVU5R6QwI43y6Q`<4;bC-r=^4ZYh_{$<(ESs^p@)wG)0hSv66|1=yHZvwi9#Sz$-{HIk67!$U^vP{6j zkO)h=d`(S-gSI89_xmX+lU}mXzYq9i#h&(b7ai>Xc~1tmT&+Pn1HGIFHl9h6M zA9w!BMf=g-KcQ}{zpjFA_an|v%=*UWeVG0>ilYqe4}Cs{a56Jt+y1>NT{J#ygUdw$ zt$*kO!$8OGeDDU>?0`2v!obHcbzgh@0R|$s0~T$|AMmLdX?^D-6eKGe=np$!n1eu* zR3G;H|5XErt^Z>VSQx1O$NhfAN~N8TGjKx&`h(YBx4FrW{QZgrSReMh?_c*L^tWCl zZT}eg`|p3S{<(o0NQFLN332{Vfbo$Idhl@pPG$rNy ze%#=so?_4LBlPly&8NLrmT(VCGA=OG8G-XGYZ7u_DVlNJ?I8@OG#+yzQr9SZoai&?}ar+-?22^1GAjHSF1F{K7uZ z@09*`oSgrY8nm8hKIz#m4rLOK!Fjji<+z^iUgQ~rqvEDKqBE4K%i#gsg$M9z zCmWc!n7Fp6n9M%xLjx2{VwZ!S1mt%+Gc&GM`nEE@mELIOv}C zfBR1kN8lWs7aV=f3Ixwo=YF=!`VX1@%&KR-t!ea!a1WmDB>#qh*iXY*(FdR+ZCd|> zZ5h*D{-@!%ikx+|r!VODANz>l*3TTAP@Q-l8q&X)@B*I0(_M=Eb>Rq~5hF-l{Aj6B z>h7NUa}>_NdC^pWN%B9VBJYfh&zkJteYgkDcCq21KS_QKKsAbIa2IaDqn*OYaX6_aj{~wiF`W0z z*1x0tkLcy+Htbi<-^GvhTAgaY!ToLbU>4576}SQyU{>y{tvW;MH2fKEZFPkcMml{= z5!e(|)g=B1uE7*sjS-Qd)gKpe;3p!UtY{Bj!n`ov@51wKeb`|*51+!tq71tV;!o;S z-rc(XQ8)`%;IeporVPP*Nv#)?@aMMcAAvJ)1wOaGnp*^ws`OXkceu0lDpZ2w@F{!> zXGEtkVL@n`6Zi#caAT_tnuK%k1$+u86vT|H(c(EAhk5uDZf?8&2AqZ~a0!m8$TtmW zI{O{mQ>W*>Z9}PfJmHzV{miZM!^Q05$XCR5nm3JKYcF)b`}2I*8XI046eYJBCf1zzNw-q z)zdHsziw-P()s-gzJhbg0Myi__ySJDb@&Ue!PBkxpgNp}&*2K3R{&bFe&00Zm0Ewc zX|yZBG5AdSb4>C_gAgxelp1h%>s5LRE{Qhy1dJQtw~N%HAK!g-jc7_a%i7E!Aezn;bVlup6#}qat1Z7S1Vl8P^%=ij-zr zF;#!4RXK-~@D+RwpTa4{u9?zs6Z*Poed&WCA8-^tll3{N@IAM-@xyRbkyQn*yA5rK z_VYXVMAPCGBN3PjfPW?C3hS@K8TbLdg^O@lrysri0XSYHgT6hq{(1NU{u^9@!%ChF zw2A{pN|H(%H{FL!NPE5#ZRn`gaC*?t9O0D0k|#sf=QR9Hta+1~c#AHBruJ`C-*|6m z{fFRF_y>FoXN)#A1NWuwMbRc<_k9=MQvp=HNIS z(yJ9>h*3B%()lyGyG4uGBuvVl=4vb{_<)M^=kM^jK{(e3N}&l{`Yu>Cm%pTXbZ zYd9hAml^H-36 zP3!*$e6IJ;phCW}+!1vRl{H4tbQz4n8Sx%pfTJ)D7=nFW)E=3QG2KiuD8n(uPRV+e 
z3~aul&4?kouBIXu{Pd;WzoHJ|vJOT?|68?j{4zA%228p^;*%X)kR7j${fE&Rx3PijV*z3f5AA-Dp6hp%8xn_RIv zT$(_gQ#~b3-xT!c6nrfQdO)YYDHMm7XK>%Rz}!6i*-1X@B*vKTSAo;AUKY|8(<0+@?1DU7YKK^4tmkEn5Y8E&WbC;5K` zKZv%vYKF2TAeLPHele1@X@KY80(@oHw}uR5%$YP`RKLF@*p-b>x;~}o?U(_k5kWb2UE<47#SG?u@?fmCH$cKZhUSb6s9aX1y~V zgUm1+msAh|7=?@Q1AH&ixEbw^jM@_X23#u-=#9AMhQVRE%6Ia<;Y&PpBf) zv3I`yWB@;t0UQ{yOePS$y8LNRhc88X9|FgwwmX8us0y^^ZIXYwK76ao z@0?=S+z@*+4Xx!$ilv%-C-nbY_yI1?1}I%0zlJMf;;kCMf5j9jz)zO7e;Wqm6nrP%`>7(zPgx=?sfB-3Rjt=a{|~?w z_y>Hg`M9bDdOc}TSujKuC6zbyX9O-4^(UU}8U#YF_4hTqyWr>dE8Tr&6aWqs{swYH zXk=HDM;rR{1^fqm1t*mE@7aV-Ut3Vo(O%K|2S7v7iJekzCZ{W9OK~j~p{*^ete^>NcH@<>O5w)+R)3NOGpMX!{KXmt<)M@WSDIKMf zbf^Mja70y_so227a8Uz#qf*zl(b)84h8i%ZD$yKdB{%?QEpMC4u>mi|K_APwC{6T8)Ii^wIiEtpP_IE<_Rc=G_VoP2b#>G2*ApMPla7me`1BTBV zSIEBRw^M=tq!{8yBK$fAr?mb6+UXfgd|zr<7w7D}uRjeaA629ndp$0T-PK(bm{OH- zHa&kpRoht+$py?^d@`M$NjM58;b3swPN{G>uRki814Bn=R1FfTN|;Q=ru;9%S(`48 zzFikYT1A>hKBFQN)0!?lAgtlY4Ge6;q9<6oLqnwV)5_jnghO_KBW{RR>AniJ=i!W# z&L7cq_NacpZ#bcS18vD*8qO$BU&0FCbCTb{5Tm1wI^|Ml;GBvKW1;`Yl(x*3aBCNd z50OUJ#Cd&CxAaZ@IimDv)sh!IMZhC4sxinTx+s&@e+teh_K(?=o>|Xa1~`-@ECrKN zi9^cVbj)?c{*`r!n%}BF>5qDZl`b#Y-Mwwp6fK3%Bg%pv zu`2VX{W&Qb;d&9|I1<4%&A_H<_){Uj2F$9&HfdO0-pSdQOK#{0oeKS%ll`34!xf9+ zx~2>15`IoCI-_xZj|<&lLYkj#oZo!Ycf>`au0F|$z28R2F&OH4-YzLq|Ymebbih*>2Yo|6JS~AaZIH1lc^(T6b?%Idf$zA zF3A}7bTGS6hH0e~A2evr3D{!VV5a(pc{s2jpe~a3lw@8(4BnBP?kn)kEe#%_U^+wd zsuI71!I+V#Q-s)^R3&DhuTJuxQlDO}PV0b5#&x}I6u5fYxln=Bqp!N{b7nrISEMRQ-5Zl|i(jv?9;ZJ> ziw9b8%pvsTJw^nlau7z+4SLcHa!qF@Tt{Zm)0xcZqylAt5j72u>4a1a>nB(CTP9&c ze-0&bh8mq|)t5$%(IAp$q4y-`*Am%P>H`8}pA zd&45w2>m>gN*T4o37);q`m7!vwW3d7`(Ltqe6qoldf%}m|MX?^D&2&;iAXN|VNZtW zxc2KQ^kY9J%ml{8yJv1ZR4YtKy-xFh)?d?U%M6^mZFrrT62Zr1KUYgIrqSea9U(UJ z&-rE$SF&BLE4a*|Q&)|rq3~&7+UEx-g1Aapn_S7sKsYSfuXTzqFpvS5&|P#BWj3lS z16!wmSII1CLqG-<=u{15wrhSPZmwm!4x%75%^K9IG{+P;N4!7K&xQ>g7#3w6CZa(o zLhDemjslO!#Ktj*8^}R9&?>Y>tmV6_NU^V@TT#xWFI3swNVfADtx1)}ss^+hetTeM 
zIFk*Umc!Z%jYceX=}KBvlYiL^_@ui0YtRgNG*qC%JZ9SYs#(v9@sw9;G{>#wzN_tk-LQctn<-%fLfCOv|N82@-QS7v*fvK=wz%v*QIFE>K+s+6G~>)`-X*+OhzmId(1YxWLc=5 zz9QnukXp4O`_nLd{78Owbcrk5UNp4+Itu6;c4dfHC_#yQll%p%0aWao(W%!Z6IywOHNS$$Th;h z8dwHd@o<)H2?N{0IQ_^5Z|G$%Z#b?S$iPU;bNixjsotGdhHm_K)gp$va@FM``AJ{k z__Ynva6>gHS(Gw#+6T%4Mj|ec^INmf4O78P-xYs5E<_`WVe+*85uL}1PHtNm=$Ubl zx%V4OW-qrVL;p&qaFIwVnTM}NcTq)JRWXI>$&z=qgdy$9yf#!*G*h+1>j$wTrOu20 zS#|yC88u68b&t$8KZ1d%4cSiU#fo99uF;WIE!o*<$DfzX`h>sjTlOxjYR}C0hN+AK zQN5Ug8JI~a>FeSy6=!qvp!knh6;regH9st|AF{&3;n{l9wUXT^(LTp=`hB}+MX($; zAR=fKM7N8zk-`z_S!Y;!k(Nb+eghjAKL3Wg_lq_5NEJ;OuC z&ZV3FW{lOB-HfVxxoCjnFUM!Y$wqx8l2op6HS4H~Ua)e|;q;9GzHi&xw>8I)cZ@)) z@A~7Ov2IfE?~3p!0=5mM9$2j_;--@RDh52iuKf=A^*!C+wzetXf^Y!nejBG?_<~Y0 zbLHL{L)g|K_iWp9Q~QCmFsjs>xBIq8y1RDr zTgqkCtUT2=`qP34JXsLpaI;yl4c-Vf2WC=xaxU?bnGAbYoXPSu*QVGzrp0>IeDj2a zRcopTa5WjMwz~rTOJqwh)GSw(^#t(IG1BUxkgIHlu_x6Ts3x|OY+J|LQhTOJnVFH6 z!uNR97OVtmd&kyavWKIS&@UvJLGdpK>GOhk?*n|NuM;yc);_fOC0I)9j~7dwWfi*t zMKqZkk*L4_ddR;i&8(@-ylq@ZrA=s$jkf%Lymd=R;bGY_i`wHiz2&-R4Mv-MGD)p! 
z+V%*-DGD!b>Z*9M+t~hf%mG9SrELcz65CyMvUa;-Q(Cv=PXt>Xi+93LvZ_>1$m)w3W9*fh_)WaM4zY0sD zRu~n+WL7jRrQ$2}tg}LrEM9-ZE0l_B=@&)T?g&hFRSED?(R8^u((`L-;aNwAWJT@D z1Jn7ky?+EieVMBD(0@M+15=)|huNlxfy}W4?Ur#q&luH`=Ypdi(9_{t7C*@Ff7gm} zL;R*+?$hrth@*M~DPl?Y!pt5n>t@@wj><^0Zs`BIls_{K8`yth-m5KSnWb2N+X9d% zeJKpep%|Gq)xh>T`bgJ=ld>v+`B?pE(TRl`FRBf#nM`Tw$(&Z~om(em_?>W0(L=S0 z;P_t)XQZ*E>9H-{uBtb(DGG&m=`XJtiS?*epyN`QSyyRNxL5EP-avC*r0p*>`|bd$ zKvln{t7C0!aeAT$bT!FoRfzHj)oA8jBypXN$YM$IT2A}4Ca{HM!y0yIbJYfNDL~5< za2eWZ{m)GPW#ttz^9p6Vmo|kfv$6j0rY%)cBL9G0)*>cMgS{_cG1mVMUc-tYvN222 zwVH5EgRu*OrrOY-SFi|&?Eple5e3=4toO3yzmDYjM*1_l#RgQ0SIoY+Cmtoz?~wnJ z9vE0OUbXN&f|;hEjh3L*R!4R+V0CN1kI1%P41<>8Wk7%TgtYJi4q4hKCIHaCL>#+r zJ%j@UC7ya}r}R?;GXt4aplbV_MA&Q@&eoVAZ1$C~jAq)dTyHvjuYLWm;0>%uf3Wa1 z+<0W@*8~leM7{}yzM$M3QWdW%qW*$K?cW<%5B+~92U;>IMiOISTr%&4i?xP&vfl`& zY|_YD!-r<3QC&@-yn>}9|8;mLsG)}1IJ+i=vMmH5JG_BK*Z(CrB;g}bMM9-(kOuGJ zdGzph4T}4Sj7JEpOul*akya!Hd>NB;%c5gDs+v^Q3;@O3Nmt~fPv9-|dmhbEt}<<7|49yGLw+mp5}v9TC#pHQV%Jau#PIw8p2Q%{nh2#% zsu3yDE}11TBlEFfq~ZOp2K>*cjk0bc@C0~EoA&}9I%)VaynttLLdjUKqxTezW%`yk z!u44U`6rRQQWu$D_#^EXcdA!#6K*Bui_o8FLyetlONwPHF|O;!UdQ_1OMi|VVQQrF zGSlBKtiWS2@4SxnzmiH$=>6l8-ZAUnm;4{YQ&>prUr+?PZgEe)m`IBIdpd0Q91&?z zAYG}gaomE?l0|P-k$=3`{%^|vIoyY{YV0eS=KJR5#(nrS^}+YyE!=}sYSD}$tY1HR z(*H$x0Z-u75b5~?m{)PkqpyyIziCrn3DoF)oS$cKTb+$Ht516VrlTdi7AVi_lmdD$ zgsJ07kA>6g%8W*KWL@Mpo9Fkv^!bE(%Oa6lvXp#Z-v5pEf7AZFgohd+kM@^=U8182 zBRQala4!Md_2G@=aY%LLxap*FXGEfXv!5>_fO{t>Z^W8>;I>)D+<#3WDFjHIX)NC_r2Nq2nseRqP_` znVU5cFa9Rl(~b4_q~B4EE@{Cr%H0#J=3UXoK8p3{HY*+NKxLc4=_vMjB1Decp#WV~ z^$(~St7gURC_naP{cgZ7@Mr3wLw%b4??)CSm-iht;bFxpWPP5A z`0{!B%ZKq1s`WpO8>j!CeL%medJW{q$Izq|!a6`nKPtwM9i|a4j&&JEA zs9+r)iFou@++bt>#-cY$H^|V%u)dGrFZgj=`H!l)=pY(ByZkqG+WrvldMmyEh!}Q8 z?Bdihz0Bl&p1|)SonQ0yH*8|QYE|QoX>l9giZuSuIKRB zckM^wj@+<6T>+PjSOLUUxTn$C7QBY5@QcFXC1>{=Wi^Aze-D zBB_ZppPBuL+Tt?YgCG6;y8cg?18N$HZ(BS1M)u=&=zm-B@wnOv^UZ#NGXJZzHw6mU z;V1YN?mDJmQ@S!D6q9l>c5aI~^RAdHZ-fEqDu@wscZCO+wCCS&Ga{(I0+dW@SS{v2J5s%8 
za6{hnX`~9H*c{EZ^&;U&S80zI!Xx`-h&HxqGa8xM4fHQ%-5itm{Wn~Nw;}(?c2v!E zMkaiqv_V+EKb$nXZ>R@#YbK?5`A*j7Cj48Z<(u+fw>6h-3E15;8Ndhd3;gT`q+{!E z7*$bb3E-Zv8-9iVgg+b)Yha}1l6Q0(Tq} zv#HBUMO}@#-Q)VgbWIoKzr?(|80(J^)34l7%xDWs`3>BX{yz=hwPrY4eY8bBKAfFB=*!-4Rgxgd(8py@%py%mNI84diV`~hMVENV)NXFd-%Jq zBGW_o9sW~y&w~8h_WKcRbyP805$Eqs#lH83&{RYIeVLJgl&>$>``CEbuMX+YiiF>m z@YW3rW=)*2f6DqiDz+nS2&rREv#f*Nl=XXFgax^FgBB3)4C?l*bg z!ur>Wv8H#vtqVBgsiLo6x02qsr94rJjWSf|?@Pu0KW;@M?Q6p$;TGLOU3u?UMbh?H zJLx~xgWomE%qMUa{tdruYd^z>M!Bg6U2=X^-surshimX?TUvv4xTl)V0}Vzt)K#@A z9li;7Bl+T=moQL1@02FpbE$DC@J)EMEf()0+)_>Esel9PT3U*|?!rBIxGisV51z$TX+s@xm4giEkctK z6<(KwXMR)J*Wu3^+*6X_sosAA$}&JN;2vC8_J8>ERLAHIT+m=`8Jfm;eG{&4J-x0! zan?>~R3=yEEp7gFxW6r}O$#2V!|55EwC6i|{vO;_0cBYE?kmH3Pgu&+dWB_?5!{mg zbhiHf0^E_PU&3i2L-cg_O8a+vcm3%JHu+$TN=dFM0J;e`hrCyu-;(tAyuqS`QYGa{ z`m>Gvm*B20%df>4p9aHMgEvWMUrr44={238!>_MR_ApxXLi8s5Gij{>lktVy70bNy_$}2S+ z?D(00c~K9HTh;6hyj1f}f0ySQFbjvos5UAG@J5CAce|7NlqJ8z(t%u^BPob^vHSYR z;Gib;Q&(I^JzFpLL)tx}^&f~?L;L*~dy=wMU=|K*{bNf0Ey5dZ;cn}n5=_RNA!shC zZSBQ=^yiR0KO!IUR)i?8ch{d>u+OdnY5$+YD{r3LNq=VT2#vu&$-ignoZD^v4IzKe zTSGxhQNi;)N!@ahuBY=m24%~hr{<&gyF5Q3{W+}j-;qguuJ!Mx|8;}So>o)eirW5? 
z7Vf4$aej|zHaD4&B^9n;?oP_(`ZK0`#HyXYJ@jW(&(F$!q5Ml~HhQ_c{?r6hGpk&B zs!2zq$n)LxXHry7)2avbwRz9s8T|i>d#~lVvTR#yY?Bt=8-f7ny(i1`IY$xqSqgoG zKH-Mm^>6q@V{zLHozSlXRb)6f}J%2;B4O zkq`UX^T$7M{sM>d3(QdO+tkG|&tH~;to6$89q0UDHL%D=pvn)NzbM5R?R7uy`L!VF zoEY~C$D)6qbOXdpQ0;f59S-@F`Mk{%R!+qKIq8Nl{@FVY9;o|RH${Q;cU|`@l3z~-quu>E{@*+P`3vpkAl@NAlb2-aalcwWAw#6* z@mZte`8@Oe^<%0w`}l&*(mcLsb^Lyg0(M;0tREXVh$a1hvfsDvvLB(JHM{=c`+JgX z|3Uf%QmX%v=beWY{TTmQmrp7~cPBo7@?)0=7MoJ0Y)}00rOtZd^Xtn;KYaf>cl0CA z8w>o%`y2flKg@ssE&j0IOeZqb39_J)E zqOuIoL)(*oj=l+>>z9TZrMVpaHmB*1`1Eo4-jJkd+;H3Wkq-k1h%7e5w%sbb1a=P3 zk99mhz->4&kFa%2(CZ;-nY0GGZJP``{8QlWHS639Wn+`wv{C#}B zoON$@j)OQ3F0&ls_7pI2ybWw0$4T`0`vYwJOE?<3bsQ(T%tdZxiIDa=4SF>HmjoiK z#Ki43`S)o43Aqc-NZ4eSWa~KZU&{7t#xhO3bsSA>;aMN~*Gi?w<$pbYz~6Vz?;iPl zS@cIP&?3RI#hair-&xj!250n2IOf!Et9JJfff0o;nM|p&p zjZSW!XZ0ZZ=U5;(W!uczrLxUou+N9k&*R?W)4&WDn5S$p`$G9yJ;V$S0dv4SbNtq4 zu+IB;nfBiF59Iu_;sBWlECEZ0xIrGFDW^%lJ}MBa5q>`M{O1K-t*TQ50^ zz#?!1xJJxWTOYQ|3We)bd);eq!FPO}Ou1FwKrz?Ysh94Bt}MPQMT z7I|LaSN{Ifo{d5!U>>+a$jWJMFb8k%E%1s}`i-9FpCTB?Wj0Ebxr^dE2XvpYOQOg5 zM~OUffxvYIrumw{H{ccU2zUZ~=stfASOBgQG~RW&`0P<#W1HGj&IopX0wVcbW0%he z-+x28eBJ^}Y{}|D|4YD4BB%5T3%C)P*N2?n=zp2GO@pj)L^5La1bEkz{vI#|Tp<6P zW{Zp`1In*}$H4QRU0P0hE|Pwe9C~qOqFr{WJqMom1TF#BfD7!BD+6GSU=4T; zi~>)APu-zViLgUA2nOTA=NjF2$A8a&H$Ca^thdvDSBOVdB=KF5|1IzWcy*Be83E1% zHz_|OjG!BAvRMIM0e=JUtzOYG06BsN{|a2=(utHgye2RQoFk-;uTu&py{CZdf*U<4 zEeclUpJ(d{x@C9-xC}f1ZV^3ckaeP%eGikgX$i2Tew?#jcYwRVc_xpBq*HK|&h-RP z(kUI|{?ot>;5H4QdEhR8F-cUdVfK!G>Nx)>aD~q=5Ird5^i@hwhY`k=nL()>F>`+b zZgFRU=8v2cIZKRyXWcq<7Ptl61+LQiSdnp@twZyIYx1Sz`4Qj>@H0W%hnPbxFp%A) zNp&&JJEeaaI1k(+==@X`cgz`(X}QrC-RGYHt^jv|TZ9hF%Lcy)IFN|k$TopOpXbN(`L0k}uZ+EJ!Z zJsPkD_Aa{TJL&HM^T1u;K8Zgr3z%c}-Aqy|rrn*c6TmeBKc0~`%a}70?kk2Uzwb1T zarD1O;MHl~yiHlUEban#alGt^!+d&A*MY0lpZyHK28rr-ntatm|2Y1+58MSVF#Pmo zP}Z&FIz_LfHKrb__V@>I4>&ErcPV+Ok3L<JZc;1TrrXF}fRn91zpyr@rV++|>`PceO})~63ROWgId(&H8KdIQ$|we{hxYQI|o8$6U<@l#+{Ry&XD zn#&pi&H;D$`yr6f`3;As2&XT^aTmd(I9(rh5xI#gBA 
z^eNyP@Bp|&#PL8zCi6_J_H*)~2mf3pV*DxQ-$F{C$65Xn#uWSP$Zji;dEh2+54gnS zg-vY-2B#T@V$qwH#lTuX{Hd;>bX`cTo z-Na&cVTP?BC^E6R;kGRT7yzyU{{Vgh&hz(8`$w|2At!Ca8^*vPL9?CvH$@LNp9PXA zh%j1BIyM7zhYDa?_SL1Ne?m=O;7)W$!Uljb;1=*-1PvdM$g?VV5msCbxbSq_wBI@4 z0ky}3;MV(s;EUGF zR(TUxH7B|VoCEF?blb}RiL7QfHI`>gx~cp#MHapSjLWFLC$pz1|Lmk=$9n!p2>Shi zv0Gl`KH>y$K%p@H>B0XOx$i9OTHg-|Na~E~Tl_N(+you~SGaK0neI}ySvBGP)%pH) zcYfoaS*osl7D6Z#z#We--{Suqy~lgNIVN9>Tdc}yo?t3%9c})(P3>JNNA}^Y%g4%5MnN1UPrG?ihHC?$ZTeM0VYp`SwZuJWWcK z0$6bd?M@7Cz-gjXDFg?g~;x-~xm9Q-WM<|HQmbk6+E-){U1NOp!#J(|o zvmNdj#RBjX@C$Gm7?IAOluhnUQGhuPo@UN}9r%f{N1kGR4eL{{V&5#qp4I3Ig1>6N5a9d_bCdq&yKvLo>n6WfnEajS+IS3E)oM1t zDzg=JU~d7ZvdvSm;Bc7y_;i5W5irfz`MhAJ``Jy0zSyvU?tAk!=%E zz6CmP-)-S)f#JJL9`}pz_}N9_an=B@0^5E7i~uvzvl5VKk$5oXe)ln%K5da+KS7^| z86Gvs>xHb|$q}*LwaE;b1$4~^LrgbYfyQ)@Iq?I)xPXp(MPQuC?Lql9uy$7wI1_Lb z_Y{+9071-W=sT(JXxEtM|5e~L-ZaxmEgbvPz&K0RO)+5FjT^4xrxP?rRS8Jv%mgzxb-}RG*elOb>IGl{7*9*#*d#m6-}6IY447xtydd$0FTZcHkns1cM+^f3LZ4&!Tg|TfN%@a!GMtcfWUF)I zOmrGZi2T}o z+AcQ%c_iK}chR)ZKPhc{IdPnQE;7xDF&+d?v(!*D`JHCb$HEs&pI8blFRKv7O@0U2 zcGX8Avq-NXKMr1}-B;`V%cTD_tDgfh=&3Z>p;uxs6_tG;-EaTnT0jwtgk zPua2wLxNizF>h^})LgMdc(`3sD6(t2D3O!t0Lvd)K{{ch><~~apgP2k$AY|TME5Nx zt_;YYW7cGt<=<~DZl1``(pPX5vV!lPGpAd{QX?- zFitHJ$f9yjybq6&*$Df*i=idQi!%4;3+V&V9ufJe%p`N`POapUimZ7QomjMj(V$Et z5#?wgL-~EH0IK`ri441Zc9@g>vcBwxhP;B;CD?fbozHQqK4jRQ&qCd#%*FUpmQuGW zF33s>`;l1QJP?-Lm#lsc2yCi9FJ{^#*(^~+5;$!$%tBvFAZ<23FYbFHVw`53(4tiN zn&z(r@>d~#ut$}WP=}Og^Kv;kS`Y0M~_D}V*s(Ip|NZzQ~5E|y-{}AyQ(28hLw%}7GRmz8qjexikc=37zFf_ z{;}m26QGvGyYRBdHvir%v>#+w8h{dN4W#D1*gR}1umWxQwX5!s9J9UwJ3tar!M$&k zXMVig6|Y1B`C%EU9D7QJfP5bN{W3EN-h3?T){4b))_$-HKJtH*=ZBA+u zCKByDgXP^p!>j)>rG!Tkqj}1%iFoiK>s_AQJ47+st+Y#93E!e1Hwh*Zw5*n7txD@9 zn>34oXm9nPOe+TZFtUFyrs;{r03~uk+F{xDaZRl?#8#&yB*cS>Maj2+s!q&PUuM@m zc@pA;SVpW(f9n{H87u^%7{UIF4%^&GB*-5+ltCmbbyGcHDw~}fw5%zA)1t|S_`$Ir zL3u0A`WjxB8nQ18G>BY;CFede~gMIA@;qI+>3QKG}wjI1d^2e=Xldd0BL*6%S9@)x9Qg@74oHf zKqLqPYL)#C<<}H&I5pi6Mh@u!wtCD$6IXp#<4P|pn0BxVU$|8)+W|H?IR(+L>W!yD 
zUTa2c>c7N1v<0#j#i*VML=(rhg-#cK_)J@clFGUV!;ae580#UWfx7vKi3TOy~#}p6*2b_AIOvDkQeDZe}y|HYV48kNTiweO(c{w z9&2x^s0pb$8c!hK)l|^=h%I%ybO0QY3M5sLw@?!)@C~M>$y!B)Niy>l8?t^RqB=3o zZ|k21>9I%5BnuYx+Vm|F3VRgVpy?(sCM7?$j9Nvv`gf~`F#-12Q?xJZ6;nz273At- zuC`bQcHIkDPgac-(+YGdz;BLp4Svf$(5OpKo)y8YM1b}j z;tzQv@%5}>ZwWD*f;)hf>Nj4gIC~-^;Sz*19lwFeU<#POU&q;1dGP`&8!;V zwY@@3Du8`r51PePnwj6U&$R#>z@9tNK6`p2=^i#0N1YB!=T)im>NR>_`(gl7ak-8G@f!4Tu zD<{{Esv|koaES)YK<q5fB5UbF@=_JB1uCHpeHyH6G|HCUnQSpmLSA99tu z7AuukNcHFy3sZs^!!(`V05Ol$Q8SW^1$a>a}j_W9RC4_1BeoW$$SGUl4u3fv*vy_MSFg+9>|39^O0?x zO>vy^1nx0)s%9b+QW$azoPQH$|XAyt#6I_3k$9-BU2>{uL3o9{!jeAx}*sbA%SMa z*U3No#y_8dWzH2C@jcnbYbwM|5Pc#4r{GZ1Du3Tx>kdXTjnbgb-*WVKSGpzjVog(M z+85j@@=<{UcDSc%lMR0s2l=cJle-TJg@g*DF1+Z|VO#;0fpwrB zxFeQi{hU=DE#``adaDG^dtsAkt|&9YW>Bl>$IOA{C;%?`@)39hJO)0wgGB^w9^(Qs zi*Qp>MyV{7c?A3cymC%h#K>nra%GluRUOnMX!axErEMcf*jO{dYOK{u%mA3`_(tHv zM+O_(Wbi)7i^@}hS@zF@x@`haJ|ajIQfcfdOLAT7xZqmV&%hI+CbrLCNs2Q+aMm=W5V{qi?Bkc2^z&p#r7GzL1 z$g;iFE?xM#O5ny9Hi#6lF>8_#ZB|JLSX1&tAA8Mt}|Jxicz6#J+qAJOP$XenaZd zQHqf9zn^_0_x^tZzd8B`0P3t-^)m(HiS|n*2v{fV=tG5#O?c3Ls-cp2ZO)|B8R>rp z{s#U6UgvYeF_OLtjd5!D967Yz~sbAc~7ANJT>KKR}v~=Y*LF*qXY_Byo6eQc^ z$?aQK`#bQQpyzLNl)J&$p}+!5UaG-OMj-C?o&kRW&y9gy@*^)RcynYrliq|Y?VbYv z0v-aNELKo|Cty=bnPjg^iF60}0Q?F36L_KYZwUR$LP85%Qqlf@6Lh{!f77zo-lZiw zpUJmxgf0Euv=`_y`_hgQW+%xl^W?1OWAe|Ap5J4CT}LOMr7&Hxa0z$_{0n$$3cSJ0 z}u1_A2D{db~a5u#q@@ENWVwGTU`qoN#-n~*X+qZDZk}2VXL1i z=se`wt(_~7PVR)mTaNx2{&89=FCabnEEQ9^|Br?)y=>Z4W^t&Cm0tkA0e>j{6DDu1w?&sAB$7m^0V_nl&a9(~ME(QuJ3PK!m6^U} z;1L~wug3q@*w&YAXR1qWsv*|vk$J!1-I@>C&zozA-St>HT&Dhf-Jw5y`apfz2_|!< zlz;wd`ZF*c6pOg!qsiAfKw};38!x64u!Pz5`3Z6PoW8ZA7uUa$UaE zT65~0dwdD}MF#$;%gFXAhSmURUVp?SV_!IcO8??ESO*L5*X-UH^;0tf>U%>MY z&}!=fNF-@*c59zSgZK0uo|^lo^h>i{SWQ`bv_s5Lr+wacY$L2N;7{296v&|(Ow%m^ zj|uwyx%>Qfg`s#bR=8XTK8gX#(BFRF6DQbIXPvOO&w+md{{~)m(7!5O9HEp^nu0sO zM(h2%S|1bOnphIEA(qZ`dFMC4ALRaJO}5pffzTJ?*Xg>f0Uv;8z@NY$z?+Vx+!b*{ z?9F|8^Lt#!`w@5sJOrM1fPOdWrZz}OW_nDJYb+Fh=I{UAv5eBr-;jQdfc#%!o#F%C_rHJ-9rU;4oyCh0`zvbVJ^Vrb 
z+3z^NQwHWYS`9Sf{cTWwfA2|uWnpAH>SCt3J_3({e*%91AKc#$?E>Ba5rH|^xe_hR za^DN!Z{T$|XmJXBroX!fT;wiV*CCexn*?2c%5|olsqQV{1%K@u2Mha|_uB$K$U4`R zj+Npu@R53Bl~aa;9E;v&lKegJqUSp3g!u%QUp~jFx&cB8Y;Ya(Bj9;Y(CIDUm6Vy! zX7_o!LgPE)Hh=0+;Ew(=73Ep(qe|(xY!USQG4Qq<(jeuASnJ3-&2)xO-|sW9MBMC8 z%Kh%M2|S~^`p$K@uBoENjo2T7=fD%-eTM>RGM;vJW1Hy|mk!vMhadM`CmvIvoUJ|2 zRg@OO?-F$T3GkxBdvKp^;3dUn8#qfA^eAB~z*FAeifSLDzsE4;6dMZM(JHi_ZwdOo z)sy~Ue)t@B*Sd_(9s!%*k^W!N_1OTPF`s^xHSQ50BvHP~M&yqj=dS}x00=HV$HCTu zbZNNQ^T|Q{vkds$k#T{8!dB3AvEK{eDe$s;6|3=2lU23_8x^LO&U;~!Tdx+sfd zeeS+l;X3M!{A#}i3cw@*NoIM0G2Qo1z|!G>7(Ot?H3Os6{v{TkKCxD~ii1CWzzi`( zCb>SZCV(C*zc0Tr4hLiyC!oayQ{s6kJbfgf$bp{$j`c5a zSw@q4n^px7B+$JuKp~%Nx950;1-}0#@D*4FmJi3h?&JCAIqMow zs+DwSKeHV6AEW~MPC%TmhXW$@0n@-7mlX^EpiA)$ zSOz`;-wvXGfg_=AQ^^c4NSHGe+{&?W>X zM1ID&dvl9FzI-?!Ou!$X=j9c+IcSr=|M^Hjs#Dx-GLzMxD^$T5{_(oMqfAJ(LAvi1 zn&s_7R7c12eUj3R2??-$97sNosA>bW;MRAxeH=(S%cKoZfw-o<-Q(aHf{=0pQb&!5 zhj^^>2iz1fOzXEVcz8!YZp9YZ||RH7ONty+DD&%kmq+! zu=C(uul&&@9AHC0KZ$2Af2{N81d`KTVNK-c==1wL{|GY>75@14arnQ;O*g}AhumYI z@9rV0+dgGs9*f|2tnc^8=L4a?RRT@_cAnomgl4tuiCf0zFQ;lC366}QE47{A{I)%&z%*@zJ?qE%1hRT5 zV4c++;b<1{rPn27jbZOZ{1Y-fHTvz#2*`1txAUKje|%nEk=nJvmp`uj2J-vXsIzx` z{tdZ#!{$!y{6~|}BmIjqwQn;^$NH3I+osK}RgN#e0ncA%%je$l-`^MI8$Y9CX7>tA`&WV47`)`y8!E=!RLo9|AfJuEL!lxJ`*U=5;>5b~qcAfiZ5w zon(&F&DE|EG5fG6ImcDJ6U>qOjM7|=?aSdg_Ca8Zn+*HN02LO`*A7pxyYr888Q~BI zom`6PyUZgUM*j(JBOaC%!4B{5+u@+`gTN%W`Hl(b_8Rb&^H@jIe~OsPmg3qbK*0CI z#ULbZ_cV+AK2s!igssERKh164)}*jU`hPj{{3W5^2z6MU%(G$x)(3rj^4}CNCd(ge zzVOT8{6EB1)W-h}CiuT{nD8($E)v~zoMR(_pd+l>fc_!$pWrs(A*QGr9L;w}IHJZh zL;4Sh^|3PB+TrIfaC7S<_g~t0!1r#%@}B1(mu=pT|LbIdqxolm%41x>y(+x_ZwfMW z5dTaQ^lgyxZ}W}II-l9|<8gWEDY9V5hponRnEo0d{U@0sb6t*WR9>4n^iwAB##-+m z;(sMRjjJ-h;*4sKPYyT@ToKqxQ$QOmQobeVa;F-!3h`O4^B$oG(BO2UE3tn(NC%Ds zXMhVVvbaW(ggs7Q2wMI2p!4?ui@;TOX^hJb>^*k*xGtR@T|OQ#P0;tVlzx|^uW%ah zE$|Zfa1alV0B3-UB%XEI#O$(o2|NYf9>l{%B8Ofh?sQ&iG_K9?HSiYr+U-jFzzlGn z^qVKVYLhJ*pLqVIgXlj3oC7WpbbCTp5A6tO^?~#+0;hqCzy+=^@0U93S3dt5_|koT 
zNB?ugou88yF)Nb42A%^S4m$r3ut41O1upRQIh}1SS|1Lge+ifa&Xa}5SSj*2;Q5MO zZeM$ze+D=QTmsJ1fLoF9vve7KI>?nD0T%iDr|AGBqCh?YZ-Avk&OZ-a2Cfh~FHZyL zSYQcw1-v2bOyBKL{Fs1-J;z$ljv{d-R_Hue)2^l>YAi zF0#oahY)vUO3Qi|pLN&DLtq}bMd+;40{GCN2kR8W5b&g;W5^#&4_oK2zK4tshf`DFkn(`8H)9wRTfEl_k)~dV( ze5GP|+}-IpN6`371cT^j2)qX@0Ox2`LYvXl&oFQW_zAcPoDu68Gm73MoNod6yITed z1g^Y8;JZP=9&}SOvjjs&H)!`Pa0R#n+y*WJ6M}uv;QgN=B-Wd5KX?>42iyd10at-p z+4&j*RbYXh`5^GJqfg)Hf19}JQ&MLPi4k~)o^cQQp8~D}cWJ#x`SQ+yJC(3NNWLCE z!~}2^xC30H15wUySlI$T0h2%xc+qVGio}h&3tVQKBFH~3h+7~aVbGKQ?)Pu;{v7i* z+2(mho`2m_elG(zh5yFbWD*e~ZHm3aJ$#@Xq0fG%{D#?86}!M$!Zf%s*-rky0o(*G z0*n0dlp?fG`i}$o4#axn{|mr9%HJf{+ooj1Rok)OQ~wl!Mc@Wi;Auh%Sq9)da1nR~ z{KY}BPX4(8+@|uiHsyegR>qW1fREk$GeX?Dd%!v3H+cwKPRyX7ELmc#9m{$=z%RgE zdcb)Z7a1f-g5`ypz_L9*1Kb4u0bFa_p4P)T-oyx1lH-SWYCwj7OTaI{ufRE4kAf7r z1_dFZDQhQMAD^(j{{`IOE!(_t!okBbO^Pd zgTO_iUY=*{l+W&M9h0@_P2icHKLF-|pMn1cxIy5(L^i+=Wr5C|Rcy~nIqTzoe=fUo z!UkwY2x#Y2>KpYV$^@PMm7wu%+eakJGe-%j6>+dAL;pD%(CftQwuzyT19Tz(dy^)i ztvxOh_3olHsmHVdLqzBZIXd)SgU~_X0&t(!!;v821aXPXGbBJ`b>;sMSRiocd0?0| zkO2XzA7b}a8352wAHtpgK5(D>QjBq@@%~;lLcdq1Dy8$g}{sB0$Uq0r9Uo>hjq@iWX}H!fm=s8 zE*OXo9%W&{8KBPdp9XFcdvu0B60q0M=cU*jeWd(930wpILDbC=25vUNACrH}EJn-lkCg3mE#ja^r} zO1=B^opaP4u`CR5UD(c>-e6X_rTv`y`T)4>fE2TfT0HXL7_bOza=U4p24lc^;5L1~ zoNV;hv2UM~P}|(FTm>^Pw}9K^-V#~gz5;BTh*?tEv-A7FEN~6DE3n0(5YOknIp^gy z*>d;B&OZuV0`3Dh30vxCmv`nk2=26x3kY@$b9Mo^P0Ub>I&4@qM-~8ZkzJoV<^=k* z{`Y`O#0~_Tdt%;vk*deKCSTk6$AR;}b>ITEPfnKpxOXW?=l(vorPcrxXgzL`{y8~3 zxT>=#{IkWag{||?5O(;oKv%^wZ67kFTjJc}cP__kDL@zg+>o8QIk7>3Y#~Wl(%3P~ z+0(!^W?shF-R#RPSjSRIhkVZzfCo$h7l0eozBwuYYm^I#4Of&ac%}Sv9=HWuX6zV? 
z2n5`_Q>G4%*mYPj{=Wk}AZ%@cepgH`EVAnpTzR*p=br*DF?^blg)<2c;uPo%TZft* z=bxthT%nbzaR*U>5kMXo2F}v@ZY%$v25ta%s6T98I7eA%07a5{ldEo5mH%%7cL{s$ z^UOJ!8g@4D9HY@q;Eftk=Rn>DW_kZMshpFtu=C!wfHk9k8CWFj@D*T)t-TG||KfUY z=K(OL-?1Rp=zkqp6fZ9ZJOuckvdz56{6|$m+jGEK;688vIe21LkZgzk^ z-v|DK1Hwf)bRs+ZJX!g*!-7@LX#{0DFgm?Y8dXA@!Ol>b^|+jR}hKivl&0Owh| zN`-o6gREuKns;}d`MnMN0$iexYpUC(#Z#^^DsuL7)h1>J7{1&R-#5svEQ_g6^D;WK z<^nk%SY+_KAW$QzyhToSe&)F~?VF?j6mSptPv9yrB&(Z(?6NN~6vzQp7K9T3L%=oQ zKY+WyDK2`_?#6;ZN`-X7H-I%q{~X;9XT1jLCFtrUU)+!qGfJzD{-eM(;3wdkWIiDG z-{c!vSh-8k?ZnalB5;qfll4z{bfWD3Lxvw)1Wj+a^PdBLrTbNssK>6&auGJUv2F!b z=YHM>F7o^)FtKcE3&}!0t4ghcstaH4@ck|JvgIM3?C~#h@?+hG)*Syl5C_aMclxtM zVgnYqHPP8XOaXTozE8=%Ideiop(wcYr%nGshCjD~MH*VO-f?#CvE#owpRYLuei8T; zxC6{_UtN=n;%#0#rWteY*}C!n1Y?J*qCX&P7jd7vN5#C$jngguzY6@!_b>3x6Mjo5 z>M$hz*OY(G1HS@40cT}jT+#(O5lU;%rZv<|1NVS`0M`lot$#Z&p=2a`!`3_Kf0y2G zQ522`zpeW7^y5Z>)2y2-59y9Mct`XQap}<%<=p| z08Fy*HOa`cC<-v;_Mj5cp65w_7foP@qmh#mKBWBdP*8MI^3Qw+|IBc-a+JxZqHHa* z%P$hJc#-_(yZb-I(aJvQ?T)iT<9s^j$IS~mQEOG>6jv?gMO=M3xko?V+HCri{-;^B zE=c!fU9gl=Du4k2-41-x;}pFIRs9XY!U^T&&r6aXL%U_Bbc1+YEh9_BA>wERg=wl$X}|7g=94zsH2PAqZ0* z`G4B@-*v7mu)dLU_gq1;NXE!zU<8<@yaYZl0W30a?q!?K6Okg1Xy3EIS!Sx+@;JlJ zKl6T@Srj5?eO#N)qJqvl{y)vSgD)yDAr~|Rmuir`F4InbP6FrHwcyL*!J_Qm3wffH z^4~tc>s++1mz-Q%4q;0Ytsx!U&1?05s2`YMff?lpnlgNm!PpwcuG|ZflsY}0W z*KR6zFv#7!{lIp>{Ai#2W-5P3Mu@*Gf;4zN{l3@8?` zH_WEJf;d~|ep(!nIcx?Tfl;l^FdK` zazfA$O2CNA2o$jIE^|bdv?elQ<7M$rX-U^~KamKF{xbwYVRc z0_K2eV3h1?=E}U;20cJmkC-NCcx8lrdwqiOm&m^OJo%?dg)z!H(Y6l^0aILTkrM@E zA-E?v5Jo&#lKF}(2zimmu1N|eFH6`Go`1@btcc18>7VBh=1Ij+ED>mchJXp)*OpJ6 zG|j4eKp(e}h5Ksqc8jvxIRv6ITPH^;0v_FL^JGBIX;6)~MzKq%nJ^S2Qbsm`Z?tee z&@B0X%cEN{Ndp27PBcVyBfC*{ko5O~#O|{{oBYa)M0z}t$0-3McYaDBpq?r-*)FlJp3OM;j)Tc`XFS)Xwc zyW`pJlvp_5GKcmbb6m{r_;t$%1^^jB=Dt|p7?9bKxV1YoZ091Zqavjr-~#l>#(v6@ zJ&Q#L#|*@4Tq@aQIw8nYZUPZn%l_$~O(lj>Uu&JD$lpw}*>A`sjA0jmb)S>v;wn>xWmOVY;X$-K13 zar{<;NkPV5ECP$^2LxGH!r~(r*@YG-di-wV!9*n4sGCYYxBz+@WyT&AfeiF{{+c90 z65=P@q+eX(Z2ll^GQrBg}SBoD;SopT2FG%pZ 
za8V+DvGJQB%%~-QZ-iu%m5c z+x09$OCZNKVY24~ps7hJh`;5_s*SiSB5hH}6UsLwWQwzO;8+IkBU#Mb&X7UYO{RyK*L;iceCEL8LNSVwhS+=$7uuoflJ*ltNM1ckJYGzepdJpYn zixw(EI#6a?JPE%bx-BsN55;u|m__n<^OpHF$<@E#`lcL1Q@e;I?`?7(M|z*K4~xBZvA*o0FbqL$$FuztQiaEG>v2lzpo)$G8E>Ic zUsF#FI&e8L2@r{H^S@F^9ViPq0_g}#vJP+r5nFvpqqYUE+L;isBoH;ZX2x}vYUWt) z+?O(-FCv^dY9ed7XnMo;w-&@QxF*(pcZW4Lbk+rGqT4MBS+%ON4QmUiH@J+m!Yqh} z@ExAD1w#9_e6cBeYOL52W~Hae@=2L5Tr~p(wq@-IE}XeF5ubPZyvY57+t!6pXIrQV zU`mk=r66g6c*-7H_sXqy_zhr-Ln=ACf>yyx=y;h=?Y?5qv3i}v+##upS=AL~DcWQk z=q}J`5&_%1CUjXDkV67GYJ0Z1v?*&q3Lr=L~T}KrM6P3EHrEeKkQ$uNnVr0ILjyjRk!sMH>7TpWhNw?3xe+`5GH2 zeetn0CGF9%sgY~j{J&4^LQCN0vhJ^G>;@Hn1G1@OM_SG;f0{~C!=qx`BS?CS{*)mU_xZYOK-eV_xy~(z6>r)Cu`ztLHpbVHO94}7*jJ}iS zT0LdofX(ak(YWIzS9&b=bFy7V+o^LHQyydgHg=Sb@c z@XeN*Y;n)XDPCbp(^Zt1*)&j zB^7eC#5=$@;EVMzCxoJ!VGesp6ktgItWGy1q5@sBL7qMC8QP+=Xw5Sol}W&yS&cf| z`3z~D4Y`}_A%vJ~WYt(KdtJU8{dd?CT^C=;{80D)1;(Bgf|4v-8{st|9XS@R_i^ngul{)bg+UTuZTL=id`~9L@&bfxN%jkUI6vmQ%QU zgk#udT0JL(u6~LBzoVHm6i%p%0% z)JmrS-vYy2;8qliqt??|uU7=!YAc{+;5|Fh0_vSKYkfWP&)1IgSNNaa17$V?LQ>ae zI$};Bxcl35&#wdTfVXVSu(k&+<9HmXUjg0_bZ%eiKP`jmc@dhXtP6Gm(V3I#>+8d0;ok^?$tH3LQmQ~CF`T~676j@0O0OZ`Yj;63o(Z6is5ufPiq0GFr(^PH#k z$vg?$?>_)<^!$#bAGiS{FQ&UA|LF<(-Zz_Mt`LoIj(r6k`qQ{5;{4)!V98CwJN@~V zBcyp+3@f6C?Dz1vl2WN)MEqq zLeT1$x(+Ck-k?EFY->J0ruA6`UJ!SB+34@dz!kD(aqz#p;l09+o2sv?;|IkjDfqUsL*r><28107K}PaHGw0!p^oPF>8!BhpB+mY&F)(TkCxI zp`O3V4AX#YkTu8I{@@(nW#B3B5O}Gu)e)1teX;=2x=73c+y<6_hrqv-{!R9h49T(u zWEEJ}ZThGn>s0DYb-y8QbLRY3gwmv)H6o=fdVB!>0R9a;?^uRblRCY1#imlM zYS7VqM$qqNJ-=08i?T-CoWO`l$8W%EGC{{sF1K1z32yS*?KXUu$e>pWZoUIPCD{;m7y zQqnJ=1L}#=jVXMaz&pxs#(J6!3B=c|^X96&-72k@vp&lm=Z|QH$_(Nl6EH4mZ~XIz z?vqR?Ip#+BqJ_+QIr{$z{JVRZXO$`>Ct3W^MF9T?lSFTUzkxr2hrkEfGuHa-u`X5> zd2Yxq+4sQTbbmV;xK21Y%d=GYb=w3?eFgjl{0n&04LTjO5urhYY+lDM_h7!G^!};Z z!#=z671oB8lJwt!CED&kfY%-8Z?Gt4?Os;FZ7@ms2KbY{)0d7PR~>Asqe7&*4d5H_ zjh$d|4D) z<1Xmez$4&az@I(Ozh`w(YL5o%TU*R~za;;>?$`(%u?QPyf4M@t{SkNrJOTddxymyY z`X|zqQenSv2#1-zVEwrOdm{V&xX_iLs}1zbYN{ukgu+FrQmlvcX;0f06#H 
z9Y3D%_p5RXF$HOx>x7>Ie*uqykKNm7_vHN>!i#n0m|j!<{sNXdwt3dM-m1oyhdQaS z!(Hgk_4rL==MMg_$>?-K#kS3i{}S+6v|p|J{1xf7HQ#@iCDyO>o}2>T?x6oZA8vLD z?;3ZpJMHI`v|7Xu_u6s0tNp=i;kN3X>JOO?Keg|H6pb9hy z8M7x^teGuZyapaJc3tUt{#w=tj({3M{9E?Hf9wVgwogp9aBk*lV4BlaR`~fq-0A)9 z=XZct4D#m*T3ux1;;L*PfOp-CnBAw!ti(F70-WJa&Kd>q3$O&d13vdWe*%1x6x{-F zn#0xm%yYg2-fQ+=KfAmZS5KHBY;u9Stlx1T`br1)+xg!z%l?{^;=Z&AeI@AkmxC}p zR%i)71E)CaT4eHR1$Ym<13vb|^w?)qxXOI2o0U$PxqAn^1m1URPxq-YD&7Jz1as`AfAzUmup8Y##+^Y8qM-sj&2o-@gMo|_e{;ck;Y*xN(+ z|1I-eZ)I7Bbt$Z{&eEIz!Jxq9Dd#w@UgGAR_rOcwHL%>1|F*cG^BZsmm}EmhlW0WC zz*}JH5c)4kiEEK~5)sY77xM2)&-16iXQ`2%<+7C~H#=;S{-1jCzoUN@Sms&`*OSqt zMtBdrIfVZ!H0xi0MH#fN(J|ZP`Ime0zdL`#Ou-`eDp}9Z7hsvyv)=miC53N+%TD@% zCa@~{^DzE*L8<%w)2+=e9{zV8z*@(S_!K#tIYtAp$JJk(N8%m_LNWcOyBW&hQ~;kexML%=xk019-D z+~K#5L;pS!uis{EIk>P6Z1mJ#O8+5XOqM02G@z^6xpBz(N4Ocorb`;^vs^nIPRe}> zz&O7m&lJQSHz;mu>v621Ek*=XIu9eM{FIF|A#5U!`%I`O*psp!};GQ^NesMeUrcM zF0Xpz@Am_vz=UkBuCecV{W$zn0>*#|?v+XyT$&0w}bh+b<9Q3{ieYfFe8$^Bv+NAJvbS z#U!mBPWSFlku{x|HHh>Fo)094f3n{XSy1*F5gtuC{S>5ysQJUc@6x~#qtg?7d>L0v zWT(OZSLe41x=;RTp6GdBR?=Bg;shU?s~ew=-@hmCe`5N3(nV%tQa^J3 zP=5Tx{A2u|Fc)|N5iDefs_G||-%u)H(GQ$I5bJB1^b;JR$oEfW4ciZ%--^&DKEF-% z{)qe)*-zd$Df6N&zh2M(*OQJHW3UrnunCwATAkqI%ifKcDaR9hOu;22(2tzo^{CY( zeRhIRF*_V|g6BO!j8CM_bu#)DrP_Mp^9Rf?fd=}-23{4!iRqt{hV>s&aGtz>`~wQQ z+rXUc_X{)-Df$1z4nkg9Y#Jw$ACIM8Uy{u~sGt)nxF0mIzO;Ir;0U?w`=6M8xh?~9 zV*W4ucN`RTuztir33!G0F$cSw1!@QVz0PxwFh5}Ny~Q_2hj=cQf&>~KN*u~G!~`~85&_;}-_A@(?H1UroHw}73)HDL}ImHb+cb-Hz?xQ=#J2ALuo z&cd5)aGLUnPch5FJ{9m5NpP4Z%`wF^%Aun~CIi=xlj0a8;PVhud{zGVu~Gzm0vK%u zq{2k$_K_*FQKm=-SYz1bWZJP(yo2QbA?AlF%<^mZ5{1Wi(ARU|o=72N68DN$&3~HaR z#O?lch&7!87Pt<4f*Z|iJi^kEpwp*;^S~uwPB2>=?9qD;yzOogQSO@s&ags1K?bU@ z$9IW{(+8e^5|{_h1Lp|d5|Q=4^D90bROuKZw(uEXhDE{}iT8;Qe?Nr&)4*9?;TUtg zyV)Y=*F$vp1TfFTkJA9h>=AqqyaGNPbpAnLo}l+rf_&meIX@6F{xJGa0cU`V9GWZ0 z^KO`E>5w8hfJxvCa2Ysj+k{!>`cB;N<%9UY49o!+fQuaZN+rSkjdjj12mN?R;D&RY zx-UwR@;jR>Rt~4Xd;hb*3~PioP5hQ4e!b1m0O`L#;Vl!M=rb$i9}m$#fq<^hi33(= 
zm(JTG`F{pD51c1_gNuM|LSczT_TKdG15N`Mfio0-_xv94naXGR5dJ9wGr%drpUiS_ z+UZX>AG-v6KE#Yn0t>)dU_qewBAE|-+abT9{b`V>gSUiw9^?NC@Ev#t{L!;b%5mRW z;5Kj*SR@?404?}7JIx*fkAU|*b>I|n(=YSw`()#=>+^mBJO@kZ$_T-4!%=fp!;T_-;@DlhN_#0U2rk@AQ0k;X;J4xk}u+Hup-Ch7s zfY;sU?<4B$UEmzSt8Bz?l`Tq7ffqf`Ujohm*MUpGDF%U-K{5Gz%qg`V`tuZUogz5T z3cp7;z;$m;1ApoHTL!>s@4LV+z*Y9=SOVWQ49^P~c+$}-UIZ2i8+sL(Vv(hfQSdlh z7lv8-f76lGnjq}(J>V=&M<{GCM@TK#`tw$0s2?~F`~>_0Tx5TM4tiN$s-Jn4%&0*? z*wCMWo8IFi z3?>C$s10`XzfaV^Q6d014{`*k5*2QkB9YNS1>g*D1GouXV~?**oK*?(GtSm2bf5ne za0|FY>owXnE;lAv$_@4;XQcOM-~qM2o0qk*hY3jvy8Fx6V4twLKLJ;${qxiTO<*LO z*MScD82xYZ{m%*Zpp9?W45h1^|NBV)pMa~>emY~^AX9rB-OMzbI{M!u==m(=&lekZ zN_w|__B3~%{};;dknB82I7B_c^E>)y`2Qm1_Yz}Ao4D&^UTl_pk@S?`tH3?rjsRcg zC6PT!zBK*QgZ}>o{6ysJkfIWi1t*E90Ou2$-b&ITn7~!T+8607_dfwQfGGifgv=Wc0rPA--e+RA^_c>$0Y3wGfq8+>@~AzE z(zIn$J#8I41e^nIGkh2l-yx9gy__F!9pRPZ5uc#50_$ zFHm}|N6;=njakgHV9lNX2JjEy7I2Dr@45_bCS-$<#$=B*p%(ow0Y6du^htxLoBzhBdthX6PO{LJuag!i1M6KCuY3*>>T?AHN|k^k>-Usour)kFF~1sb3x zu+D9_08j=l01tpWz#`kb^K$=??NMd!40ZHt^Zz~IJ}@uk&%7KyXAon|s&86Nag4F+ z4Q9SVVHj&9i)HC}Mt+OHIpAmDXW%@2_dwP~)ny+|F)RNK_ihUeKQ7aKHOkwX z#_TMpI_r5C_zwX+_oWLcmDOF=xVq1E?WxiKXW%E`JkuBG5*uGuNbFJmTl{bO=MrNK zKMPJ&*PXM=oUrGOdfs^off4&4%p!~JyKS=HEVy97Q0YvsIii|89Y^!Lx z^Pd5J1^yGb%6*MVwt3Cy@8W=jJvo)Y1LlGIz8%o!=RdyTBstvnQ$(*?r^S-ja3OcQC zQ-%=KFCfZ=EPPrjqtXfI3tS-XszknffpG)T&h9z%nM2>>GLw*_gsncqA}}WPfpWHV z;ge`4J!MwPXWaQ+-|7r^8+Zc16;TuA1rchGSQ01fK$w~ z7fQecFvrCRvHW-}j<03G9I;Nb#|mKI+XO93on6bdEOfa?25|Q`%N{2T0Ml%H$Y)nf z$AZVzl!1V}mH2bO2)7E@XmXh4d&kLj?o}No$a~r7KS|Z+@u!-CU}?(OnPhj~89RT8 zbFC$Dxy^M5gdJTJJ;6@1NuXFHi1|2^?ne2Tr?5E8sevKhf5|+!Y4Q&QbP3{##Nx*e z0y8|n=lFk0+Q!P_`}?AIz~_g7)4+lg*%B}foRPXsAZZU@7Eb!)pAqiHZt?FZFwfl- z0hNw^eA!?hv(aDsnV_ScdyRYL}?d z>_wcuJ;g;9E&Vwm`p=VHz(#&>SS< z_>hc7#xe^934z@OT8Xl?<$6T6Ttl3|K=|30n$!^AKLA1J2L*>4S-l0G%coD(&tCmp z4)pY~q?9?C?O5~RiXqLFB z*fO7{?jaGh$zniFS^{!K)JNHJkQe%g{Bx9zAXC=!T#r?2OZ60Er&vVTSU4xB4Uy;^ zmn>_R*|BMv5AY9x_^}oT%;zSq; z2g3bC-t#J;?-0x!pr`XTE 
z!(zUET$HsqzWDwSePWu#mt0WZo!=9V(YirMcJ~FEG#10uqQ9lLa#H)Tkjs*hnvOA6 z^-W#pff5S{rv2s5$ZKmzOw|r>EOTffY?bB5Fq=tE zgqDFb0C~}P_TvddseoiRW+^ik4lTcG!fn*{MS;Y8oq*c2-_9S1rxt1y#W9KG z3VjIA#3CM^NL$2f_Sr#T868i5Ru5^}l8`z)Av+o&S{)z-*(#ydFP8FQj*f+KfpJ~a zf{j>qs9M6$ldDQavAR?@cKwxgLbZl6EM>WNr_2(3YAMn~QHW*|xPhGd@n ztR?j1yPB5aHzi|!pQpMloBuRgT3NWg-#KE!CXipY1DihuUXa49T#U5F~P}d_C;rB@ePZ|MR@~fVa$4C*v!ailT zDRE4!^aual8edZR@j&?6`HTkM)%AvxTIBu5`Ipz&v_zb>YpFli`~R zT3w~@kY}54lYfSo*pM$^orH~sfUpmXH(>0h$t#Z}aSY!8TVnQT(DJhR!{=q1o_Ad3 z7;xVd~l6q0CaZ8nVRXd5^zWVJBo%7r^B~d4KzuGFFGW#`Cs8+;@Nm_H=W7$2RVcsON*<$tBEP0i(4Yqy$IBU>jY9YHkW5H0eBvtC4 zSoKd$7|v`Rq%9pbzz(0ZGg}?9CIN}h2DGoX7bqe9 zcd`mp-6u_rBox3s8^oz`v|&H%7x|+1n*sq+w4p?o-UAJge4% zl&~MV3!>Adt1HLjP)=n?Cj>a8uReNdu}+ z>-*jKCuK{@3ZEOO>E>7v0ZCVe@&c<=xM@N> zhp$xNT07BNKMbHer7T)`8g3e;HU|UwxC$>`PuLp!VJYRysh#xA{u0t*{{}T90RLkOX^t z{*%$a$}3%=TcWYNIWT6Dcd39j!H(!<;42lf*(&p9trO3f^`r97HutP7bHYah3cJOS zzR@o58Tf2-9IhvDnL5GzKusZ8g~ul-%(7cd*d#*DJKBI$*xm|w4chQsA_uk4zr*kQ zDrTr5gP5LpFYf){0UvFS#`T1MWLsjBVyjs+3w+$mzXIL@-(3oBjYv$ZG~o3tV6-P% zpbo47pMY=1|J&U2vqGOB%*KIQkgss5(r4qJJ)YnAPo2g@{BYAh8zfemlK(%851ohu zYxN{&!;ODG+aRYC=@np`G*83+i~e=6!)gA)eBd_E=tQ#n8)pz(hxa9qOF*Pt|0g%VB&(%HKW zd;tC;Y;h|Mm}g^vHAC6enL&>!I!}nYwrA%LvY^%0cWJ|4*8cpB*p*Lhnmb{AU`R+{ zu7LS*DYr^}03HE<**1WfjXlHC3+0I);`2JT2)p`7*AX??TsS8518W%O_0~9k@dEfA zcx)SOoRpV&{=sZ3bwm0Wocp~DJOv&b@TRM-5AnyX8dzhDVK&H#>fZ#tZyBJbfR2}? 
zeczZCq@R2K88`BLH!bH$uUkp7jpoFf06O|VRQ`!bzY%F3Nu_fvr1!H2d;*?ypl(`^ z!vKB!K!RXnAs5Z8122Go0}qYJ`78~hvaw@vaVEOGWxcvC7+Ghh}9>;g-` zL#4m*&k!4nKr>{z2;`s7z~i3u?-TwhbaezZnYsA_JeQ4ZseOHkB3RCr@yrc0@BJRF&62)0 z{TmA9J#pMWQu{w8zGwUV5tT++s&O4@Z1)upNy05Ve?%Li*RJ{aCaupC;NQSwS9bo0oM_GBkT*95*`n!i{GXwJ zgZ$e^Gmw+;GiFS`PUPZ89k9zv5(I>Aj&cZ?lGOzr1fyt>kq1gVwbsVN*gK)@0TAJ@6R#7u~0AeLiJTHqP>GOw#Xg8|0tBKk2@< zKNSgkEMRMjAG!g&1OB4#^|50URD;=1lUSc$QDxHZ9q>2sFNP1D=hyd-ve45_rioqy zk9#uk9!sfB-kvq-IPrFA2FE=a>#OuvU2R@hljr|L+;4aLoBDhrMM)LR1Uv@L*W0R_0OEYDTv=e)&@3Nr1(hw`4{k_oBsRaMp)^i zK{enqmv4z%-bw$eL~4m7kIZpg1K!j6KI+N;H3>XDi9l+^tNcRW|4-m`2QWY?{9`p3 z>z}i_!CT-F@Mkyf{l4UU!fe{J!L?hjfk(jaL;$Xx~49e*sTA=)cF(d_%O53Gyzz&nMJ>?>f$37v*8~@+MiZN)&DLp5GjDe(QZO z(^cpB-TVKp=?m*tY6(Og_|7Rq3ny#z{ho7|Y873d?_}ez+^0Rp=>(^MKLF2xR|kPM zSGk~XgP_Ud+$+>%q31pD;vn4h6}rja2^v1kSh~i@^C|EOSUCvv`W4;W&%itfW1Flx zeFRY1AjiDzC#Fi@>l!Vk|2w+LAGrIqL}j{5F!Gnc8{lgP{Zq!nF3dY?gVtog zO<);#2|Vw)3)%R`6?m6{1pys*g+llIOO{%C(0_?F!ewBZ5xFDbE^T;k{z-vlb{BjG z&T%E0GbR-xmcIgC0n0t{EOvo6v^DD-G%fRJE_{3iya3+yq`y1A)AgTZsgk<_Cs*%) zX9umL{=$d90%w3zT!;$($Jf9!(yzz)U2d>J`I}@$)uRFXO5FGZ&;OqCy8@ggo8XEqxL|ahDrl4xaX!db;0?cGv*-Dj z$p0%`=RU~IJ3DmXUI1?``J$iC)KkmASq=*K^BXt0j`|s})Dzam$*;@jU*KSVft#aT zo%SX03Rvn13sdES&~@M(cQp*qnb;)@yy$?0@H)O`1eoDq;0Qsdce&5p)ef6D_)`WZ zxLILLK+j#(WQB#^13w`!&Pn+(V$xKl^Wa<0+q2Ig*91&)Ctk$1r7v6_lpOMWKQP6~ z@?q|6+vEAavQTy4$LCrIJ%5z_{hdReALZf4I4c^nY4I!fpTvheKOk!o`Z!_fJiu?- zymAQrCpc+8LLJ!T_Pnobo;dI`3`}t|!~mnoDiz4l=Pz;7#uNphK?Sl(`X9!B0oRI* zvmR4qr;F><`g-{Jhq%^YQgSiQef&aDs`QZO3&127nhrA`TjBX#q3&q<&k&EIpEl6m z|7%a}tLL8*`juID{Ki84+L8P}Nkp_E8L!>s@}=*GgNg-&lXFKHvgu`v>jD}KqL+{4pFZlZDK0xP{c{5S zInDd;BmG?G)UnP#222rDa1ijBzjp86Nq?{VJA8Jm^|7E^Wqa3Qs!u;9`d)eIJKR71 z^Eqm-5+i_p9w9mMVUG`&seNnw{pK-#yLAl%%;cuh`dB;W^C1gbR!`Yy zcUtA7R&<=_J*i<7Wpbu+qVtz!sdmH$z{=s;&wY%4^0JxZWb`k}`$tT$RF40AAm`7~ z200cw7XOrGuTd&p<3~U2K2}L7uz+NZ+sFFk$O6_0TRAEJ_i@ulO8(hBCjWq~5G9sm z8odAGouc{BOmyGLr~){$sI#y?mav z^U8_zXOX{OVCP}wMEWPl>dyw>zX$!j)^51?m|xv-Q?_oSah$8|c-U%N$Gz8%k4=64 
z;8%FrUV#(;cwTzke&qK9nHc&J`ukGfk4{Q|NNZ#B$Ifqij~XXJEbl(QHC>$e{K0>R ze*aziL;BlJe0Wa=9itzn-~WI7?@4pqk6a$4Pe1bCIE=f5bQ6JtqIE{OV}x_A%-2iTs%U*`@IA9r?W7zw*y60XB|H z|31>sV~4(b|D)+2$SzDXAQhhfX#LsG^Owkc#(De4;r}A_?*LFhi2sdctFN3-`F@Bu zIRu;{=)2X4YwQwOCg^u>H5>v{z-eGn5To}vmF&`7D~IIRJg!onBkof_P>|t}55O|8 zcF6hrfLUN(iew)9UAKVG1bsg2{ENT>Fb7OXjo~X1qgM{mVLmX)RoFA!y>G)IpE=NY zSekj5^q*pmJ})`$j|9E$Ertc+%1v`-V30MI?<^93KAip&z#MQImm$6rryNso?;a;wtTF3V$p$!e!v2bw_r8d|-lb=hJkcQWj5FILvogjiV2krv4ct zbX_btdUyT<>2LfyDPuQPg4Qn`M}F<{$Ed=p(rEYL5YT!6{lF}+04&G?a?8*(NLi`VU&|iQz>@w)xXfg?$2d)Aafq5bVc+{RN zzz5(Z@U&;4y9mq!SAgrl1+L%r*R}N>PoBFfBCEyynOhy<4M8F2{kw5+% z_}tSJjsO<~G<=xaGh(UwBb%(>ce}Fg{8xe7z(pFMkX=Hnz&rka@AC)19B`eW-Ln*+ zh%GW7fzQC(gSwm?{VxGGfD6DhE1WKhblp$yfH5}7cb>lwI0IZ^i_{36@SF@uyaFBr zFAh5YDc~Y-4Y&j>5O}Z7E{AXIG<@0J&dUAM*nGUhjGSYrq8xz(`hr z-viHq$H42JI>-ZNfh&X!o~8Q;vY?yhehR!ih<}EG3&3^YJTM203I97E>J#uBc+#^` z&dz_43Uoj=C~mS%`62M8=lLD|o&22R?)?IN7}tIB0(inkw;tyo2hIXlfU`to7$#_X zg-zm54{G!)QGRa%7X-L5r4IVWHp%Bbb$AGz1}>3#tjPjE%r?i@(k0Y`e@1!!OTa9h zXcq+SvrYL4@T4bbeHmCJ^DYv;qCf@yoh^p0%cnR0yZ67!7CCd`n`~5i!4bLM@;d^Y z1FivAfwRCkg|EgVyak>E&w)?f{9gnX`TL86Kk#UMwt0Vl1Fw7b$#}pF^~XhEuGM81 zVwN&EPf!Qfy7RH)9VUURz+HlN&+-COW^tz2{oq*gt?HlxaE6$_R|$QUV}+^0PPtR; zJ9yg7L(>Al^@EDQS;8jYB5+|| zy4AqZl-va+$k*x+U2$&F9jkl^po%}u^?%_CN&jMfGC8fp*Y5255 zem%l2|3u){L0S)=eJb-*e(pv;cf9{FamVfg=cP^B)?T=n*22;y^}(I~ybnAe{|^)Y zsmkZKfoXE&%bxsy1Gop=Bny^+*tuYNPEZ!8)%H0$)A|em=LwoT&jh%=YryFVn}c;K z;k`XSP0;n5)bb(gNTt?E+KVU>fj)12hJdrcE#MAtp1yNlFc)kQK!Ng9)m%gwI1Ah- z@ZONY3CPY(k5bj3NWZt+p9Agy{{h^f_IJK($_czdf?O0B6V%iR83oP(ce1$YMcMY~ zqT?DcMt$7S^Y;k@TqGKmDNxrR)6edcB6sM#H5r;E_g(=;vl})-R&`ueBxT3KOFjQM zaEZX1^PJEC$4yGiN?T&m*^qDcdoh`dr11rh;1Fe5Wq4dA&d;Bnw0@BsK3I73v|26aP8n!d`ciq$&!=NuKdY0rKU z;XL;M3^1&7edTTX-v<5@xC_h?biN_$%gPc>I^lh(^dA8(l7CJ~^2~_q5&8d)g9rF7R!Vc$T&xO0fGQGDloAxsDI{{n-ZUZ-HK#JKt zCDyb9X$#5lzcZk}0#_ycYx2OcAn01tTYFY)aQ&?X1TpzyKY>O~aiYWb(@SUM1OfX@k!JS?VGSs@tZ33&0$31Goj8=X_WoJFY-WQes)N zV(Vt@{1s8@w))}N&oZA+~g(s74hYZ 
zz!U(W&Xn%5@y`@+5x4@J28QXi+Wp7W;bZhCwq!eGhJP;60kE6*1Z~?4*e18_ZaK4d z2Dk}aBmF_b*vPBT-@hd5#Xs2jUGnV)aGs@~B40k`wGWbo0#4rS82^j`7lB*A1;Ta# zD+tv0`?4e^<^7M6u5RvZ-SLmf?@huECjyoV`fz#LuzhZ;{bc-eikX{PDmk+)p1l7k zuqd<7J7z;BfJ;msPcr=QWkH~0fdOEO*^qUs$@c>bz;z~{%Gp(?=EM5@nSI{eXFLBK za22=?oMDW^3}Jx(OEApMq247ASDkgg1pETr1!kx~Ji5OPMj;*-7J7`xoD5@{!QUB! zAuGt zyk(&6r-A#xPrzA*$98gG6i_NlWsN`jjjFm~{SeybhyzYp9776d!e;*Ap&}@7{CUms&lGTr$)DjYMhCLLoTEK(y9-+UGsf`$Ixs~8!tmK|W0?CK;1~r? zDd={YxW|Lk4+Vy>Hf7W#J@ddIaLUTuAwi%g=-s7s+#^{$9?9O+A=ap!s&92JoEG;f zCtm@6MM{-6D07JI=NAMwIu+7+S#Kf_i~w^i6cwCbVT0BAETfnQl+dRv0Mo3}xHJg{ zfl0PKUEnSH41$K)X)524|0H2x)6x^*|C=ss0fG}vRqm~a`drIlP{q@A=P;>+7IdS zrP(D~U@puRDOw7lj~Q>DI@GMEO`+u^FVx2--4KY}c>7d#-L>hERM71_{%ekHO2s1f zN65OC`AOxr(=1V3AU|sS^7ZXxEDK{HYS4`Y1Vy%L_jVP zS26x6VsC^c&zv|KzIgDS{1fmMT%fh%0e!$I3xB4AA&U?F?4R>otnRw}V`tDNS*VEk zzmZVRv|1{=;fH}@5zP^5(_$80$Oe%>;-%!5QF?2wuJ;L69lb2Jk40^u0SkC#qgLN` zI(bZXg$MEgSci{kQe&#HA)t^)G(gwI7j7{>+UH#~d9S8FN{~hA1FSlyBz}Wyqyt&P z=d-+GQ#w(J9lNGs&D~WMXpG=&fO9~K1?&w_nT1)hjl z6i5)Zx?XeQoEBKnYmFI&S$r$Y6Lr<>*v@_L9n_y{%T3KK$=6AHy^7c zg%7XG@k^+HV)B#z{lpAdD#*d+H6-#yAaCSsmOGS@a1&j7xG51~C>q|v;r5#Gg1Af; z^P1{5`Zf9TB|4W;o>S$1HqQ0rERaNm&&9YU=fU-7*W$$Tv%Wyh#F9=itJqRVIXRvu z&{L@-kOL}~m{;fwepKLLLNU6KATE*r5;GCl+GSvgiIfM6L~e8jHqSHGB_L|uN|^%e zSitz#V<=}~4VaJavmT?ItjUMOJ9dj&k8t;5ByZEdUrfKTa9JE*UvyK6SKi)~UzQAQ zlKnN4Jgq93k&la1NTDojHK!>jK3pQo#JR;`NdC1Rksxbse9@;?3~bj#^MW)|5bG6) z0#}zH&Yr>p_0f05UK!G%YYOEiyS&F*iChGdeUeEig4Y zFfgfZ{=)zO03~!qSaf7zbY(hiZ)9m^c>ppnF*GeOF)cDUR5CLW1n7f~4}t`+1Ic@1!sFb+rKa|6k=lYP0WV$o-#M2Lb?;bpO8z07WIt06+{tPfPtp z=vo7IG@E0!b2k}_X`hw5hm7LSh}!*e{likb$|NWEX&!@8=95v6qSw`h{8ENb7D2-V zhN7*4ncAs2EJB5Pmz$H+YW(33@2B+p8SOOZWU+&bK9ijT68DaLC820)m*Kq!?p_BNX2~ zjKj0zcu^z%z?z5ggfUb$GjUXk{=~I_w_A@fmIxTMO*WDSFOIQAC`=5|14d@z6<p3!diqnPZ{bPtgwb(ROo(YXEIyg1nfyW$7$gX%`dC2(^w1>wxP>RevtUB<5*BVhrs5|3xX zxFaO;S8n9CK(Sydk(n8#d2C{ohG{K(FdR>LXVWR71_(~N`_LZ6>c*Yiv{Je$8^ehN zcxZ5y4&4Qp55o|y7kw96Qk$0%G^~ zX*BhAFnb8RmD^q!6witScJnK0`UKpf+B4i+h#*ir1CCGGXYd*^ 
zJ&Y$gQ)cQq%=#&9@|Nn(voTv^EEYA z#`YJg)}V|QouQezZc}kO^;x8F^tnA?AzyiWKP;93i|o<>A*-5_5^1w|M-PjJmr6AD z^_eNMhmFFm8w}9q63q^i{mR+YG{|9sq~pYX(DE|rMQbY+DQ|KJ5V>0Dz&z}ZR=I)= zswzxZ5Y~K z9PDiH0W#`CPFnGW=&z$&q=q9CtRPoNn3?IdZRspcZXwgD#5<`^Qw8157!jS2oAvP5 zdWI$Mq)3%Q$inj;YXa`uJeO!2T67caI0)hR!E$>L`&vEf`0s56BAq=V-dz6rQqaAIJBz2N`}v4_B(KdEBs&*wSut2%DR|`F zQr?{WaeE|)s3PNUGiWVL*)(aRe}1X&@%g8K+tQB)v*kQ$9a4O`pO1>YAKh-A4$!iz zFRRLN5z%eiF@;}PpTEvReu9#h47Nvh6>V-Td)@ASxF&%U5yuG325R?iQctFtQ=OIQo zWJ57Kr$id{9Y0~ADglv1cs$(wuq{CF%oyX9avWRX_fz)ra*^mby6e%|bfJ*;!{gt( z)FpJ`fyT{1whIH?T~~U9>1N!_cwz7^HH-8<#OgO`cMBqN63Fz z*YFFdfo>y}aG_RBa;ypL(MIITuZ)6Tla)j|`GmIn<#h6SbgNRmWc;INdO~VS!GN{o z`BsrsEKR`H#&moTF13kfl;e!Y8 z6Uw$!l9haFIiT-Lp!9ojCDWL{uLeFE5=erX%>MK~@0pn#B2lC8uS|1nJSJP3OchOw z;6-Os9cwZj&UBXiRI6OJu;}AQt{ExLC#zT``*c(3y9T|sIbN_tggwc`@w%#OAbx0g@74RL0>GLKLFeBEcaUG<$*4Vz>3C&93rs!h^|Q%k(Z$A%jJbZ+_mHCF+_ z*U%ZOv?2qI@EZpN#NEis_`%|1*f(F(7dd+S`~<2QG7UnC(2|vmTggv+=;?Och$JtS zM4GcRxl~VcL<^a~!Mp~}*y)$4Sn6>Na_Q5GW`3FGN-WVuOzM8+#9%MVKC6swg+YZD zmo&oAG@{1yA{URu>XGD6gr~XFYTTc$4#1)ckl4T)W$ufA=OG;F<9Jo}-?2zmr``%@ z>of!5462pq;A>~ftIy!S>TflVit5u^K2bpd)J}B4@b(5w4oAzEXC!!?QyT2Q{67rG zdIWfs*|JEW7Htjsm)pgp`?SrtLw@F}%X%n99dhy9J zRt8fB6UHdE2=KDTS`&||Q2Kj5=;m078yeob4{y_BuOBY^wsv;G3;0)DYa{J4({5*^ zF%4XabEyT*O3whD)lBV_KOdWty0-w2`~mUo)NmDV!Ia+y1iU%>?`cQhK0@nKWB@R~ zOE^TE($ZXD{orBz@U+%oEW`Q{V?u)FT>c7dZEb`Q+;^%XMkIrne zmwH>5GE`?8C3N`g0c8qYt?^pEA^C7&l2Bl(Mq=>gMZ6sbmOJb9)aHjIec%4RD=&^~ zC~rw(x%fu`?bdYRjEI6L?+q*wxWbsW@F9_zSDM%P3#4bs#j+sS%`U}5_P-YTHz)FV zW7p^DK4(Jj#TdsD1~^$-?5*O1RDRBfUA78r0m2Km_n_O~97&s>nG0IuIaX%E%`dxb zEmmH7Yb!mtvX=vVZxPRXwUf){I?lkz*B3@ZmWtQC>but|Li+Hz@^(l zRC1b)9@ztQ^N_GNDImz1W~n-xBM`&8%*I!|F!+t)6#noJnAq0l$H^n)mqLz>(3GXXpW24nd*oP4XW-R5mW=1MqltYV z!^f%lQTpd6=@$?(B?Rjxl(FRZnfFuVVD8a->my%4 zx)Yr&74)#Cy^t?{{=XlIaxMQjhxz*!zn0`#&*Z2GRb$x@Q!uXcHvXnpo!;WqqstD- z%P+T5@Tb~fS)mjPS|bp~Xi#<=PuiCVV+OW!x<`O(zP)0$y+J~~wR z)Z@|!X|Yr5PHg#Zyr=x7ITwruCs5p1^-#9=^I+`0a3JkDWC#77$j_M}*f8P9W%jKj 
zviV&iAi0IWE8wd!u?x^vJE4?3YUvYRAmB@I?!DAG^`X#N%T{qL;-mY+&MjbHC!bng zi**z6r4tPBwjFmAm!k$AW1$^M4{5R8!WaG6(AP34*cb9&%Oem!@QcfdS4(W&)-}DP zmxTlS%)B-;@D--^+3XTmOoL_Sp+rh2Y7NC0GN-+JIF|#O%d6!U4jMsXFTcyKT*fp> zK_dCMYXrq+GFgx?U?Yc)gIrCNhx`PRi3B>{w=NKKCB4nVzWZG#@Fj}(S%f-1dR3Ha zEPyE}=VSXdL&HzPl4k^6V}+>SY~kl_WSe^H$;gnqni-5BfBtX^N(D`wxiEAkpu^T9 zZ0s&Mp_Ru<{|@x!HsXP`yAA9O+{*9ZZ^p|jweVh{jeLR&kCjM#WG+T zqMnqe+?#lc((i=qw8$qGh$0;5#=u$rkPZT zI=;5`9Vx8iBwt{f>&6>D7XI9x|=OF&$>%{j?e>T3vzHri|Ua%kmTTBA91pK`k zuEItAK0m^*50+K6()}}xQtA0x{})|MwQpPaIfY0$e_;`Ox^r?Za*pp!0~L~NnI!$m z$E2}mWW19Li2uI(_NrP#54AZU9ZI6afQ{n4QE4sm*ZG9czHvC%i%u-tGPiQ7{wCdW zPk-#uPmpuncNr7|jv$(bhUZ|6&@PT|(In6oTY5ghhG375b9A0$DHu=|9jcrQ?P$H> z6b!nhh>q1oC=v^*RFbr{+@pMEql1551LA~wp%|j$U_2 zIqBL6Bp?WW8QXKi(#JczQAB6Xl*Nn#daUxAmhOH#n{?R@o` z>xaRc5zMlhF?0AeNixbq4w?}VtF>wQ<=FP#@PXfe^2BR8z4qr?xR$D0uAym!((&+Q zAvvtUhwzK-N*QunK{*cQT*U*LU%Q>9e;c9ioj{`(9sTJk{>j8Z!(j8u@PnPbZ7-EL zaEt&0ZW?N59mD z>Bp!oUznfD$2=@TXHM@xMa+9J?&Qq*}qYZ zF#S+sW4K{r^hg=Y_q(I!00&lmV;}W?T#sCfbOA~+3`6w=TJIAq%R0gjf4^}r1lVPC(q)*t6M4k>qU?T?-6CPR;2JN>$(JFVgK5npS zQMSQbn;)taZdmT_lVB*);pQI3Gmc#)=C;0Cuq?vY_zf59Y4UR}eXFkOwyp#3L1`3i zbiy(N`pE4`QC|3o?JhjmH~L^nU_ZT5bKw&O_9(WpkJu@D2f}ybqqgH{oPdMeSw`6x zd~-^UtVGVXzX~v(&>y_L=b(eSkCdy{5EF6LB~<21*ND!-7|+07ulMD#Aw{l3vp5`SU%n``Rs4bUY0H;XBP4ReKW9 z3(O%H)Jy3Hr^Vcx76z!ca8A@*PVNM3m~CE%pv>3g5Vuu-ry_p#Aqpr2rr}JXg!TPs zBk+7yr%JtE&F?b>3?U?q5&;e8WAoHH_l@1P zGeiPi+i)Z34W{UK9Fn*z;bbBv==R^v#`Nzl4kMg>61I55eAQR3tTjJo2jB1@t?BTk zb`L=&-4YEyP{bdJ;iFMs&q4Wg>#cM8miuqMt{i-u@vG+nC7RO|J+n=`_co_Cn*SPq zDUY>3@DDr|p?)Yv*KIBTb$G?>JzF`tcIW zL3C)LiRd56kppwzG#~mcj2&vsLCb%aXdPp{?V%xa+;USH%S7>RvtzOQe z6PVdK`WyT2?6Juxa=-bk=_nFho$t+eWQZ3}bkfKMySmdSyp{P76M_vx-OL*%jiVC% z8}hX!#Io+FmN4e>4aqT^@lY^l{Q%#el1+Tm`DI0n3F^1Gx(Xw@&kTXze~G5N#n#Cx zu?vSrh!T8}%4i+8-ruQPchg9HoQx&rp(6_(Asc zq1Gz_w4A_PdQdOd_}L26TTr3^f~~<~0}sfE{h@rw_+9w%j~D?2vxDI_9%@yfkBm>$ zBLFAE;0)fz)u6dcm5w8huS4QLjYcD^F5v42>6htW|J0or>m#gA){}SK`+TnmO9xyO 
zB(WEpTs0R4|C-p&qf4LS1ZH3|yV``k5?I^Bd<0dg{X@Ko(nM~XA_Mu64dnecL=s)) zwFS)q{pkzl#O7XtOuk=qY9YQ2CevI%s;&iY94*fgmY*fie^1kw4iId{9nlgboRER< z7uz5-A|A&p!~P?mz!(iu2b0y{iTEbciA(EU(6$EEpv_ZQnSt*9wD$SD9R0sRT41#4 zmF$7PFmlJB_ahv*u*KZ;Rji*}Mc-LDzT?dW*<1K0w{OP-VpdC?^&C53hr#vK{X*4n z`$l0QxTA@bvbLmOV(EHGAjg1h5ZZ4bi?FD7Oh95wp~1KsLV46Vy+=B+-aoAphRFWR zBE+=4cwu86g0+IDdA{msG zrXvE;Rc6~+xRp${^9K$NP(0lz_*s`@thOt!DQ=IiFQ(%RA$A5f#DEgmz6!VsV^!jr z_QL#9abiDl$^M3yzwO>kbTW7fI3^?3f^F;*UpG2R@p=g_(ERe_g!al(a6w%^2a+xE z=Og!cZc+orsqNx~STR44+^gUIY^}F_TQ{GIu86@tm?$P#G3}}#b~>gN81t!)3DmPS zzHXR4^k_%BTwfDYA7ICyk(QEYEjhXGe{uV%g{H}I{F9)4gf&*SyWUNoyQPewZV>V5 zAic;lTAjbW)?Err!x6v8ys$P^R}Nq9TBwf@$J3Z|tbdF1@Qhl&d5BHLkD|rui)1s9 zK6N1&I~GYoYdPafZGv`vt_Dh}R<=lx zdo3a`FE;Uk21KYMPj}E`EZJA;+Wrzg51|M)QRPnv9ASC$hK8qs!9GKTZe`Fl54x+t z{*8UxmMO*VCz~kHLwxfiyb0apXOp_c+vC3LkZN%99KS|>F6wqk^&KJ#RwQm1Y zfwL3DAKXgl<8H?)e9cO3f5Z|9O(mlj=5i*b%d<7)J-}mksbPb7M{q;L~gu1@*;T$w4>UYjLJTXeMa{Z?O*H-$?5b9 zy@F+|5FN*@xbYZ~r}%$(5+??9TK91Y+ zP5+1ko2`X0jlwy(de`_PGt*RNcPnV-9W9kc`l$S8z+VE^bw`awoh=6gm1nOT`XG(@ zwI9=(9z53NdQqBzUodL*=G>1(?p2{c^`6@eNfMv-X}R*s@(%DhSE}p>Sgwo>LQlzN zhk_N=2cjq|^d1S68?VcigGa)J{(^07m#^DLj)yRQ9y_(4AsqU(QE?d2oURbe(l_Mc-On?J z@cJtZpNa&ZV+$!NKaih7P_#N=eatD=`R4ViC>%qCk(fXI^u^@}|K>=}2O|N&^X^Yq z$H~mYFqO9>sAXB)H7*7tXNbe6U_xbjuJ!Jydm5sJn-@6u&TNI8Q<^#aD2f5CMC!n$ z3;6-6$3^6Y$3Oc=IN+2tU^vhtF`fkY_g9CMmo57PwjJh&KU}AB`Tjzc=gn~v1-5@4 z8Ht|WGa0%P2I|;{{5=IF<0S7Fb{uWfrIZfiZNkbT9auR6Rk3ft?Y6{GS+g`}AkIy} zc^KQA;|=cOz%MVt{`fB_4li@0DC&(Bn03UtDEILYsvK#^hhTplq#KRk1SM{{AN<;> z1@+?p`nqLdMyR)2S(&mA{4jWJu$Su%_CEffXTd$4HV-4xSM*#z@ik#`SmQaE{|t}z zO0g*375AwYviEm7A%uo&UGlUQOpAL(*AHsD92c(yrva8>dMJfXkJZqcy3Z(&XS^21 zIh6V`VDpMeB(FBV;5ilY<{`%xL?-ufx_O`p$B*CO*QaO zpLl7_O#M8U7TEdzl_FxfcS05<2r@i?nb)b+)YL(nX>!Q(;8}V8dWO0t)kaYdm`06z z!+tK|XZW>_9Srd81tEHX(${4pZmv4JVMqx|aMSOY&zCf`qFcs8b!AD#2)>PY29}uE z$=Z~>Z=+HR3FH1WCwc(4Cv_2I4&xQ;s>Vs4EyUMw?hW%|2Ses_hd-^@)W!d_@Ifec z)#PFutey60emEW%rnnKK?@y^3|Gnj<_LNG{QGd+!p0fujrN8dAV|$teTy}c^m>a_be|aNaw*oAL?tUB7$h<16Ds)Nt_uFd 
zn}<>dU4|K#gU;}(8}Ujl4S8D}LWsL#zU<~lpfl`ysq$LSpwdL9!|I|5yh1lOZC@Q7 zY7Pyt_`8oD_6%utkn=fXygcXi$&MnpAvvE<(D!FM8T9F{6t8k2pZ5CVRkN;rjt86! zm=}-`1N`8*yXuC!cq+T(sKy7ILoLn+2O8VF~(f0Mm32eGdJ-Ify9sJJGT;fz-bT_8qYS*-*A8_4(92P~O`CuB%oV8=?5x@+K!ktDDorlr~vo|K~iK~hApU6PkS z1LvR|;=Zrf)lDWl{`y^t5*>;B_MR=ubWN#Bbb{K>L~ZFpFDTk@$fwcqoM7%j>DQwW zBQxdAkK@LazJr#C59|f6upe{q-$lV+B4RPHc!|3_&BRpxT4!J>|4;dj%OE3n23(1j zk2(k0MYH;AzQ0!6G_w~f_X+yv#`GI}?!XD{@^`e?k?CNO7%L06Ufs2;xZ~^qU7rz! zB`f1-T%VLX-!|yrv5)s6Sr$a~aaosVqnVpTIW2HWKyH7oEHcgAu zZCaM*pA6hC-2wQXVrz&dx6*!zw`dk-m|#v z&gFpBeXfKRs^;D$tgDXkcS9Wi|5^lyc zZ%reBc9y`3rbe3gYl?wi(zFh2VyCy1Q0f#jE!8*Q)=nrVpES>b$hx+?{&4wQK24rm z$~4&f^Y7z)88M2C0Ku1ti2Ao!)Ri^(OCDP`(1jq1B*;(bD3pANwy22d(Yx-bF2lAjF41%eSjhcirTfl!?Jo4$S z7oHK_h-==8K-tlH=r^8cW*)3jBgxORQDp#Bk!nR%(_fM>l3IqFQMj;j3}~p!Utgs% z(eyesA+M9WpJNZtvk{k!5L3<}lv}I`)fP8FVhu9Nju0{!Z9t3YgSjFJp}Op%Xjx&T zV=Ng$07bVEuU*W#6z|olo5<0lsE*4fx&&@4671K-g<3l;SZb=};(fY>&>%BZ@oQoO z$BA^lhA})}Hf(yr4oJ-DbJSh*5Z|V8y8UA#i@;E5m|G z7%i(mJdHw8_s6dX;Ij#i^`dP9VjMnRfN4+e;rm71ojy9Vh!6EWV6FYz(;{n^ECG)| zJ$q!J&Xu>LHf5{l@*7`4P>^?e{lTL7GZ{_!=T;B1r=6D0Y8)1Dapjolts=$T-Jtb2 zZ}8a2Y2I`y7lBm)EUaU-<4JNoKgq^S_aZHgdv%oK08tO5P-j|Jv@EhF(uy3)Dis48 z?D8jLT&!X&-ZRrHuYZMh{+ViUGLm13&q`qYQ$__(J>U9VdISdt!2F7P7awWPcOJMy zT48PE%W#57@-iX(OkVf0jBb6S&m*Nbo7F^6!_bE`yYI9fW&=x@3Qp?xB66Ol!Y?U9 z0dgd2iMuD&t*^WlmA(i%5m{`jf}4Dn%pOeVg%Ktz9yd!*;fWu@hE)FLr13;*Vvm3w zQCz(=QUXX6X@3mE<;u>CMe)mLdmNtM9dhycxqTSB6Gwv&LpAc$FS3=JP!A~y;dc^` z9-|%|+Rkk?jH3~GGEU*Scvr2*{!EQxI^ehz zxmUBgR=Sb0@1d|j0OGw#-`>$g8Ia1`Kv_+P?V6SyTp>Oj;zRC;+`;Q{;Xz$U;Lz14 zqYV5EtXgCFa#GNFu00C6o~uH}jdvU*3IhR!PSch^W-6Y9chS~6g@^=ZGWG#|!e~Z< z?fyKCHa*1m_k$A4^q&zdlWb+qtdirF=l?9j*-%O-i@La8$v!s$$M)^{e->YzFA#;1 z$q^dU%nmEJTIGP01G%`SQ;k1tE4EaD=nDI_{18G^9{=v;sLDWV*>=P)E2IFzom6Ma`?0 zIzDCkBJ6lj^lZr+Fc*EtbK-Df$}c+Idn28KFg5kO6uP;6418sp;B}Y1n#C8!@>xK^(YtQ_ zGy!)ZE8o&$61+lH*+zkdiQwN3Q9M-FwrAzgTcpK~pZ|a)DaCDmc|d%@CiU<;15P3Z z#{~~e1c{jyBOa0d!e?UIx1|4o??g8_L2)ja)0_UT@ZQ_fnd|9KRiGs~JHNy~HpmDDHGCX+}VV21G?PlS#|=65ig9C+n_nrs+;8 
z{cnF%g+3;o7)}~V2_|wJ$9k{!y-67GVJQc4ypewplbrX%Q4*P6>d?TAek$vcfdX^k zsBuOodpUJY>oG~hq3hFBO#@;m^??u%u5fpXmql}5n-mGJ*r6JLQ{lxV1yycB!}pp= ziz|8CdwvsW8ShmhLFq7P#0ddE8e(?&T6-VX`E&ZRAz|S5oInS8CuDnilg{^ThM?Vi zO3?RLg|Rf^K8`y2`LUe2s&Au39>h!t@f4SRX)sz_Pob<4RK@+HN%U7N$}s_RCvM{< zdzhu$^NzEJ1IIqZboBts4FXeMk1bIWJnRhG*W%#g1(1_%d-a^sMspL*EZ7(=!J6+x zVj-+13$i;OT!X1#*!oirnV z%j>SdboN2E!Q~WNjze)V+*p&S8yhrJM`!8FvKnVp>ZfUitg=(M-xk}JyIP2wnKG>z zAxP)^;=|JE*PbJpKeEqdxwj>@EFpOSB*lj5_J945?;s(Ho=Un_}vrg}bLH0>$+0L$>C+?9gZ`bzKs^})Saa-$mlAN%*& z9mK3Gj`rg`KH7^W{u}hH*E>lh+|tqTOEi^9b3M<;vlgh#{1QKD!9}d{G?=M zjeP?=8RB55PDVCE4Rcd#CN_{0Q4RH@r@b$SAK*`g&p(MMeLRB|1%!1ixJa^LjL&XW zZp15o^xT}t6eQLA8wZE~u35mmbu4TXQu5*L%gboz>m~_#k$-$chnPPiC|I1#R7e#GMlKMSi}>qSQR$wS6@gE_-0)W}Axxw>s#}Sdw`!!?hw`fl z$D(AY+YR1P6mu1Jc5l2WUQQdw`z&${237REf4uX+VxdZE4z|$u)-3EoL8-a20$Nu$ zDVBekoiB18XdBcl6d0w9s2j+6vLn3*{xTeNwDDd=pPR%`Mh|8YZ^?KLCO`Rt*yy}u z%h7(=F=}U-)K?9WyX=y7{hnJ);CX;kdY6@a9--$WWkzaCl&habka`Lce>`vi`P}T8 zn=fWyN$2LIeGcZ^Z%(Z*4XdV#B~tNVY|^uGo1dH*e8)cW{6~_Q3;#dAgXF4yS$gOr z#!R&5Z%5zNyqO}vzD192Qr=2{_BTx}j)F&>mpFa@$*InlMg{#I7eSHNyt~95i-M|L zUsW47VG27OfsIyzPQP6S%e@}tW)k#N{Em8IDaYZl4>3b;v2hbgA4WMyHOm3K%kKm& zh0%sJl`McfRv42Zfb8AUcXU8bmg8eL8UiQn#`pTd9K^^(diwMDENZRnqZ!dZdDcGb zCQON8lEAt#AN44anOoKk^Mi+x)3L9)vf@~*k7%(n1@R1*tHbD!C$AoL61XQV3C&AZ zgDxGCuVZ3;i8+BR(4)9si{I0H-Odpp8&@H_O0TV6$Cw^09)~qkA>vM_Asuc zXw@bB*EP4bn!y#S|Dfg8KJ->^u|KM%zD-+>O`dvsVj20sd^dXfQk=IYSJAAVZy}98 zIh(*MT5{0R9?z#Zrl0toa!}M9u$a1VJ)g<6+Ad<)qTO_!+g#r!s~jZ99IS4Hdh-(k zb^tqMoG1L8^kEBpx!2I}(@tJ?0I&Z{iYCF4WAT|+98|%F-i93WIt4G{diu>zQ-_DP z#46NmDM?3A&0j*KER^njtTT$C@H^)_%BJ_RldUsLA0kFS+CTRCmKYjto!LC19HpRo zLLm-iV3}>u`3A`ECW@H&W|RnmUujvhP*<>@a&`&wKr8!at&*_vI~+nnj2E_;n$QQW@m?+@mpo7KP{Y zyeoPyAKp)WG8W&zu1Zkp)KqKoqMNNYI~$xALN5IO0RRE5ZiXhdD~(g_Tc6*0pUkr zV4zdLr{8oQ5q!vyXR3>FpDr1svh)8U9znjDwUj=#GDSt|ML~jycqYP0zR7JLUEQy7 zG)=9|n4qsQ+KO3n*5jKyn1N{*o+K5Nq!(sB6u5afRP#HHXSMo%YLNm*;@V8+x9xuT*;masr$Ifd{_ zJb${vjc0P9k(Yg^gBV0g(|Cb8aJ0AeU2s^Zi~Lc~=jVs~YV%k9ZrLY&?Ond76I8z_ 
z<`a^Pu0J(1$B-!T2}`s1uG~_J{i@8883IWZ7?2>9(7e2*yME$2b(P#`&4~8_E2or? zGhR$<8%Ma+Hp&9Uy|Mpta~F6VJ;LE4tAa0XUntxu<`?a_uvmHQv> zl{kJO8cl@%L$^tQtT)5$JMsXQ8HL5yQi6jq=I6$-@-l%ML`ihxx)>Uyn8B$%?Mnss z%L_CA93esL7ug00VC%v>!T1T!0#wcgcq&0)BV+JIH-COZh2@lb+o+Q|wU!kmNvxEu zrFT2Z8p*|jDa>q4u_4(_ll8%Dw)>c{67KiqaC>m>7e_G+a!%LjLuRX)XwR~&=Gv;7 zOoS_6b8$Jni9>jsP>Z^b3L*1@L>i`680#zTlMpT3QKK(wFz)uu-@eHH)Twe^qo4Yy z392C%eCKeFeX+ ziS2vtzA@nwiED|XPnvW^R}YaXHw91V>|5J*IvK9xDY9{S-|l+Rw7e;6h9IpyqrnMU ztXZ&;f0Lb~ritccJDC_FZ5CE0JiT}M)!n4#-kT5`j913c{mVB)tpU=^QrN}eQSar?<8Oltql_k-M$q>p(V zNnSyZT{h1pDm?7R`(=^1qEAywId~zu{*CflXKE{FuEhtGMFU{Jh(OwjU&reDL!LV4 z5T`8({-4p>;??4eRBX1z{OP3-cIJeI$u=EW**4yLiG3`^iGt`kz9ba?6j$rh8scjd z85ud>mtOSl@w_v1eTipPPId2rk^%b+Hn0-XRlZ6S_zb?sh+_ElzDAx2c85SA%D5^m zPkS{<$5fV8G?{w1z3#bLiHHHCa;oX?7z53=*%=Q+o~tjR-bL%}hZT7?%KMj_NyK0L z45ocpG`YwuOuuM?4jXR@mY=`T80?`SnnQq|1O+~gY{bkI&A5oSUINYFf)DA7iW$hC zHXnppZqEP=a?`znX=LRx?yQo16=cyMyX0Sv!>+bFcM zcK+(d=vGy;-8Uu+3lonW`5o(Fd)ea9#2IF+D__nH3Tq$vz{WwtJ_PjDdsP#MvrcR_ z9A9yHzb04KUvcl%zFOC3FhRey^^IhHudlFKC4_H$CL_yr3B2?`;Wb0C$;xZuKux*FiACNFQNG* zMlh?ik3E0n>mo;t@PDea3zZc#n9n^{j+her9oyy!WVV^l7EATvwV`n>en z?8C~djLh=i=7Bj|XL;G-g_#WW_3O>7+m`|LmOc&pR@rtgFsUa7;;FQ;GUNZG3qK-0 z=56-2qBU<)4cE1;A07T`D}pRJtX^||m?_NLD)Z7B2~}Qz(aM{*LJE|-4W}pBI2r@` zerhw-{|#!)IrvZEJbw_w@L%XFRrW>wGj$Ia=AS7}=Tff=%-_70P{{S+l{W~s@7IUD zl6s4X;6aZvb1!WH$WUuXQys?NRB8ypu|1pTY>?Fps-DL`^?+mV&uSCA zjyu2BD*l%v=0Ew?`4K)ylb>uYHfN*?BmEalnXyM{_Eukw;5Wm`#m={tF>&~ zE+5_vWI}D^0>b!Mg(j+iRBxPuex5JAb8rfEe6P^L6=d0Ju?tgMnTg*pw4Q7SNjvSc zDeo5XXQ1RNVU0Q0eG7c+M0cI>-{(nS7JruSizNvYYRwgX#~wTj4-t!O78|qV*Iyh+ zp<5@FRulc`5k-nPUh1!ppk8iC0mqXvEtzR^_9XW@~{!S~DfC@DYOukY@w_{)paeKXZnAF+@p z5@_9q!$j}P@1=2(iG#q`V9!5an;#9Gylxj_{@@aWC*5GqP%^wzv&kj1x zzRZnwX`UWYgybskGllXFS$3(>!B)SaHe#f%FB)u7v~$ZN6VBC9QtSAIwp=Q+ce`(Z zB3PBPDl^%rpUk8{QJJi_U&>0~-m$xW{fKQ{{#7Hu%KJsoyg<8mX5`PmB5@g5|c?X7g(b4(1K*^vKds8&~wN2r`LZmJN;S^+bQ5ns%EVZQEl8rj#P5xX^&gE z7bk-#5JE^hf~d?3sc`MVh8bTBEq?k5m*fW>d6AKXzlkJI?pM>D!-|JNU5$o4y1+lO 
zudHEH^Pi-_^TicI$*I+z0{f>2F1@aF2gl_lPY;v;q{IpLb%E-kmQt z4m1kn{_Dk*6dE&bf%)4DJpjE7WKDx&!_v*d3&h-dI}smd{YIp&tXH|o{W@#s-zXl4 zq|2<-_}O+#nGN+FZyyl zx4=y4xnkDMW$P@Z3q2y7kxcn#XtCM9>Z_n;mTMNX7>5ZRMTHSS=wZKv8^ie<2#z$b!(r6Dq=Newq_XklU70_DrMV6&WVXc3pveZe9RqdUc}bcq!c(u#U*E`db|Xlb7oXgF#p{+zk1iD(urld{`fA2S z2&e)PTJzfJ6Oj=sHlr=OgBkPB;h)Y^qo~7aJ4(j(IlKyN1W=HwEjLvc8Pn&N0fIpd z*0Li9%~MW}ZHc7livGu-L1ZjTNySv~6DxOPHiwP)_a)g~^Ccby%PIH>H_bSbp3kfo z0USrqW2CFW)1>U-;vypY>V{AK8_Kv8?<>icv{-a<<_bqGlhOvg#9lvncsb$emy`XD zw!CarZR7zyef%e_Bc!IM)qd#McY3?lg;W2Y;R;n*J-- z7UTb-_rS{mMnprheh(Crszx+0Y%wQE`q{Pf`-Yg30W z5hw4TQ|}bif!0XdKTU`Iz63)O?$W5K3D(-&kM-*^AX#>r=H!L#{(m>u zWv{yzD$IFW0|eoGYfYr#rNg&fGIkS|3I?ZY+|#KB(+-LFbwY&0>Pr$pg7M1iSxPar z+V>VQv~+wuVmW-btYkl}c_@c)Ena3N=C2gmd7n!-^C-&vW;K&=&ou}nv&UZzcBEZS z!HKbD9VtDwkDXg&O5)NmS$Vsh-THSsjvnaX{7)BIww7?}pkVTEN{#ylH1}vIi(tLo z%#U~af$E!f%iwn0`hiD-#`L-+@MYg?CE1kA;f5?s*;p(k+i{Av!e8YX$)T#TYrir6 z@E*x7$B(%0k3YyP$6Axq^tw7qdpnu*hmDU6B;z;_rstAgeoD5twuhpj0hNVR8SIBO?j3A z&B`P)*Qe;NyHK1ywvTV+=Q~(Wyhf<%SXa{Cp87lswj}TFv#S(+t2nE)LPhh$u+}c| zn%vCbzxPtofs=DlG+vKCAfE2+oUBjg!lnO%xLBwE@h~=+zuA0O9c@ke-`KX2uOGiIv#krOz(;aXCBa&fx_Nrj3#Dt~T{;{C z5utxc%j?bmSTI((F)ut0;KAphAY~`8_z;AJXk&rusifgeGLaH{hzFn({=-|1xHL+C z`g#(5YEG^D?-1GzR1z%rD^1arv^R{uljzx>f}{Zr#hbe4?$c9M`$|}^7O^az7hDCW z%$e6o+j|Xzctc9U$`-Sg+wWPBkQPoP*Xv$)Dx5zNyZuSeC!g-!bFgrkhkoj8(dFQ9 z8=;yJ{Xj_%2C|IZ(yTORFEH0%a$7urCCgYPiXD{c{sU6RO}EdTy3o7tymv>9-E3QYt53ie>3$ zPp8e(rVB9B(gZ2>L_|6na9hP^Naq!RH8s`?!R~vs!bh{t^R3t)EvB4oUiP#2UMSfq zylCw(bLK`Zu9P}I3RrwbkjN1hZdlAyxyh(P*Atm;7)t5IttNo3H~e7rz6Bx~#{7HX zNBp(w_?3ZfP1D+Jpv(p9P_9CvSSYaBd7Nf@g^6@?v0rg|c-L)-uFY}adpX`pw{&xv zac{=9{jEI<+CG%TFN)E7MAG?ZO`#4ig)7=|P#?t79itL@C&aEXG-H=|UwSCoY1zUm zs+b(mi1Gw)5@#bGvq(UDtNbd*1WD&;8tbrvuE7Od~;VV-zEK zAnb#A-o)&iJv`OZfXariDi`? zGm6I8kVSdm-2h;wwpmS(QdF$s8%NUTjn_=!;Y;l@-nEU}PdHZ47en8&_#MTGnRL>q zU^=d<*?VxjKDv>)1++PY2re5ZE5O3LmZVLBN3hwznCd)_rTI~NF)k>(f2uij39)z! 
zAv{D7t`E4mjU`IEZgFMMvGsRtRlf%EPN4ehIPbn8p8b^efVZ(*;)1RE<;XMRxp<$K zu*Pdn!?}*4>p~q;+u0F&S&77AT1RL6%89=6+tfrot)R?E)d6~odn-CAyGeaAGD@4X>+BJ;&YZ7jK-h&*Y zDDQHP_ofh02kCQi%i+}s8Wi2dJW&~B*1sL;DG)jPSv9NPq}{P(hIUjVL{Wq;g5VhLN0UzK zE}X7}UX|U=5T3P-H3m=b|(5?OfepAxFh@a_v*?JF_x~n&?MdaM@W2_hF%@cqpI}_6oA= z_C&>!f-C(LZ8>=Tc-_GCHUneH$M*K09If2`wDu7q^0m4$bp2i@ z4JH5qYgJrFRTjZfed#8zj0Mn=R z+|&QEtyzltt>8-F*AFM(05m8PkKyg=1_X~NGQM>w_=Y7c`?7WnGWmAlhjE(573WXOI*}(S)^=s6 z>E+kRHD^gjl2&TsQPAg;)BSrpZrnRe9xfnocAE1Mb4?BnsZwh-t8On9xX}d3MqsM` zT0Q2_$=ee^Rh?RS-x6cuE|HciMZaA~?*K`iFlXtHHjT1+V)Xc|Bf*$DWXD!GeHr*y zP&7{BvfJby+kuDN<2^ZjlTMf{_D%%4t- zg#~{aCqW43bt%b{z(8&2wPP{gxyG70Ts;f3XG5OD;he(qCQQ2`)ME`*b*O==Q_Xcg z52r>pCScx&UGpKYC$=YD>)#ti45>Jt?&EeYrKz7y}QIkFD>HSm1|E2=*f@gdOeJQUu2xp+tEKK~&kxB+%@gAR3plfuxiBVbI}Ybyq}as2x3 zen%;A^GBabK%KH|p$fzXsr>IrUaQ^#CJo^diSi-8l71T!bo-?z2OB&$#zWTD!C@U`rz(^Hbbuw8B{|4R&CDFWHebk#er{Zy;E*jhuoDXE`osMlWZb_CIPe z<}V>xtrCvEvvga@33_t$HD&Y<_slMvG_0*D#eFV{v?lm}jLKy}*9S&l-w`$AP3!CR zI4gAppO+L_C_o5@5n8PU{aq@O@AAjZ0!W#5aW>i?sDA0j(L2ZpCYs)msxMTAPy{1% zVIdE4N-MWj3VE0i4rRzHeu7*E=Dm<)6Bg^#mg4CRa@WFPKyy@&?@+KdFL$uo2tjQq zR07WO5*2{cHUNFnLWVXAl^ufsjWG@Jy)=wl4v)Y{8lNxt}AiAG= zJdoO>|4=bvC2I6z37-e0}tKP;eCR;`&T5; zDjMljauxZl@rx~SW&zxH%I7eIV?k}J7jB(jkEyatbK}T2-d#Fp6W1So6y|hY53$nsRxp6J_XyWjP9XB5J0qd`m9w& zqlNh#y-bC6e9Ji;NEF(6&|nRn8o~YUDw`$j=~gr4OH!n@Jr+vN}|K0cCBQ_ zI_uT(&8Xw%`8q>!&@9H^3Kl-y_?JA9Hib-Y1KDkVET{q7(}`aM=f#Z{hXxkec-|F^ z^HT(eL_S$2+Qh>IpSh)fug82QzOgf+TtB>*@NqDHgrE*99EYYB7nvm4*O2On&-aEl zVyYM~61x7-SIAaZ0de$m&??*8FUcye6i2d6HHFfFP+D}m&z8>WqDLgku@ay!(Va5y z%!ZUTK&0Nk{~6sh!*Q@kHT(Q@;Y0yCU+}I^ia%Cr!->UYHd8MC`kH?}oAv2kwb(jX z58u<=xlTR>gFF9Ay206JuAR@tT)~eDo#jkHJ(}^j#D5*V@4+Z7Vrw|Mmq`dVEIP+S zd0=r0uK$L|9IvPCi)LXh92^G@QQFhnP_*Xi^kEuNYttzp47m6?7^;(mR3w>WUZ5y% zbAJc4+pRiFjpF3L23li>ias3fDW5^BlqZN4?!6*EL+H;Tf0B=!o{{%3T?7m6K=fEF zsKN?9z^V`QH<7DovBMg#TNZMwuKzF^m0QNpqcMq{fJ1-_)5S>-KEv%ieWk5f0?HBNX8APdPw1X8Vg`E*Z_^l4O z=fec8_8_zl>6TGaz-<)CA3nVKY5~u+;si+XFUVM&6%w2UXdbrRM&bT>F2Mk432#5X 
z2e0(c*IVqSfL|!5%L+OQgdV+?XxMDJ9^Qhw zDW`zmWz z-;3KTZXqX)VM+D8dhx7kLO0}v3^;FHp6_Rxc*CUg@Soaq8a6I~yaA}EI6qkqSWWo{ zJ@goA?_w_I2QrcVV_J&SZDDL_W$*v{=e|Wm_(p|`GXPBaZ!h8O+Sus1CfAM>L->4sR+z8gcPM!dJU#i1)|=$Vgx?rwUQKs9 zHMt(;Qmu`L2>(lN&|Lq9oTOnD*i2pYO~0PD5hfYuOzwuzTo%BFgl0uxQ%v zU+7|-Z1x&y6E3Jb?Jy(k<~ljEWY((%JZI1l0>DUr>!?nR#+&N8(`U3kSC15xK^|7zRA$gZ61I2+2 zfc^fdg73*Z&&Qwvv;Z;8e)T-8-eJX;d`S9Q?JiiAc2TN!CW1Cs$k|I3X%$t+ciwJO z(w=xpIi^i2I6&JA_GKtCPQz?OokQ`BHvTmK{5k+$j8C3-XbrL~e;qW#^_`S*Be`8* zNzC$vo1W{yL7jMv z=3g+<{^r2K6>BkOy5Uuip7^dV!O?-@0fdc1cmWS2+5#M%+aPs;e7nD4(T51BH5!n{ zA(d19Idf0MXDYPaz^=$z)V(KLV~zS3M>BHEnz?{MF@hzy=ywjPJi`^GDosVExi>bpF9=-o_Xebm4xph_#o4amO(ChPe z&L}X*-`nD!Wjluu?7WYXnCHOU*`iEh&9Jb@fY^8ZIoufovE)7$*!K20giU^Frdb%o zq>D*JR1+I*h_6K-ig)a4n2A{`;4MzOh-#v=&%KSrKd7;c2pP0nfweI-=<5w}bG?U& zdF_@%WIeHev2&6@j}CYdA~=r_l1|i7RuU%sv0N2L)RU$!_H={dxoPztvAMyfUPsIH zdgt}SzE~O0H&hDL;Vy}KN$;T82!Dj|^FK$8 z@zL`2DdX6@_iMn>3(&>)s%iY0eXLOADS!n-==vhfOi#Lgy|$QrTs9DhCQ?{e)EAMSvRWbMp{)Ish_?OX7;*GJXvw{Is^n8#QV+#lAhH%<3C_5ngBS##RJy}xe%Q3Pld?~ zlK!glmsp?+LJsS9%tL1n;FzaLa^D6*B2XbjRv$7H%-oU>b&Hl?oA&A+`LOjZ-3jmm zfnN$mN%<1oij)EJ8&mcW6ANw&&DWY3<3D~g#VP>a4P#&Uz2}PQRy+I)^Y78~j4hzC z4`>JR&|EZ`E**8=1+(-;3&9)HABu{AaEN*nf*W38pfN%f0%Dcn;+i>AKZ}0=`0_TO zS2lx1MMWL6uBH^r5|OD449%KMHtBUtKDcPBK?zUoH8m0lCz?Ds%!?K6AH(J!7rw9| z?^7|MXS-oyUk3^57yCa@({Z=fC6*WPmWRNXgSBGhwy1SCdktup9Uh-dak5_S%A@d5Dn-#9}#vD-~KR9rtZcjf)zF> zVTyJFbumWf7^e=s=qVThy0K&is5ob-soFD4jcR{X674%W-9a4@XaYvYKNCCl8TP-Q zRuAjTAnL6_t)`1G`nIk0vJy2jd?)>X+}-7SLzA2i(Gcr|3vtTuHqR)abYppKwaDRM zt@iJ~QIMMvHpO0yMUA3<`DwY7ujF2?RXgvje!R_%HA{yP;ew(uPS-Q7UGCby_Wdys zQh)rMTxS*j^!UCb)dYYP%QI1-C9-fo$zOB1bcTve0ojAY9vh{yeUI;(*@czwBuv8O zX~O^NlSts~ab@9?hN28(fbkX#cqHg&e@r8>72jT8?g+4o5DRgEUuRBQ6d6fhd$JS8 zVdSIZJgrXNnzff!{o3r)u*G&!LZERnxCQO_ekKe9$;b{hEMwG{iqj|cvV**_zt((~ zA@Rc$UqbYw9thX9&A{7*O4!88uLZe*{`g=C_|p&q%8gGm-)dEjH1ms>7M7=H7c#HV zAgg>Rtlc!oB9uT8G8)f)y*-WF%&%7Bn8&qmM|5ql$Y`lo2B-({BApM57}`XU`k&?M zK?*F5gK_~isFlcrYqyn2hL3O+6n_I8d%q$=D4gB9-ZpnE`+L>TvA#uWkNa!xb-v%5 
zTYU0!YsSZ?&p|bU8qm8zWW{3M=G!(tHPuO3?%<$PKKCZ-3y(u|UKu;EhglyTIpYsK z82a-mH2vOBX;H>VQ83P4um5F8>6cJ!w-%0h`qji20tb_Ox@#X11=TP?Vh}XrYHCHQ zDR=CkGdWS(3!Q6tbis2&*XR=WO0z`NgJ9Nf62m_WF<3VuVWUU3Gkkv0^?t0|@iPS5 zSBH1`S--hbxmp@kA4DDWDdFv%4A<$iUhqO{RYQzmz{j%46MPIT3$vDX{u9|quxa%h zYP5-h@}s`jGR~A)%DC-yRu2)Xp`dPyfb_(ehnj`Lx5al<_gd;mD>?vbcpj|vA%o15 zy4vn%W*zOF-uZ6n`I1oj32A7ueoU!KnZ8Md3D3!Ha7QVf5~84vSvhE{pPl? z2R5!0;ga;bxl?u~w2)c zs^8I5$yi74MsG=YE&m+d?)!2m9+R(DXsLJAQRjXPih2d#%VT{}h9xCMv$S_J6}0gJ z2D2o;3M{=~`^;rh9Q;Y{z|Y;pdEY!T8@s&dJ2;QXqX`11?iZwAE&hm{*9|C}i)ojO z*ElLe;!mvRdca%LZ(W3ei)~?b(&aUrz_Y-Q%F6Pb_sh-MBaKzVN4F8+ z0buKXUAK9;)u}-F5KSMhi^Z0WsN=F7G$kqwM-Z$HR zRsLtCw7E%5x=J_cQIjW4Vyk<8@lfJtJi&I>Cm;r_d-D|%M8!4IKeNd2o zY3`3A!@%e+s$Gj;YaXU_eH7=rV}f5SnF5`q*JiuGa}MJSnngVw$T0B@E`|*DEa1bq zpG1ANc!vJ5(~)A4;c1bGP=}aRwGLsb`Qe%_g{REp)B$oVrGrap-|Z!X5%|Utf7qhP zM4_OL9YR?`@Rl@#L`M~>e%9Ak_0pQu(}hq#@F0=XVm42tJ-zK4NnZTZKo%3AmuO|d znfGa`?o~gMH3Gb~Ba^O!<#C`LbX$+OAUc?Fxc<(9O!lF1suiFAT>_el>#Ex0q zRR5#I2XWm&dP}ewxp44ks6~4kT5Pp0tM(hjh(+;D~huXPH4agA%TcmTdEQ;hB?gA z`8iy^3p?>0{3CIdN8H!9W(y~d0=NGqrm5s;V^;zc0nHoiC2Db}*D;sO6uVXJtj078 zt=U?l|034vdJ^#TuDs~${&fc&2PJ?a4xUszJep1kwOVY_DD}pCq0Z?G8qbd6I z&o%$EU@CK*0IRs+cEo(+!095tjQsg0EF^k{?mW$c3?h08~v|;;66w$5az9MLj#&J%OCdIGIq>F_X!~r@0sk?*TVz9 ze__^KWr^?pQYiF?bzyBszFJc9k1KZECN-+}mAnG)++`to&VW3|Bf5XvbIxHwPfqI^ ztOUGS^8U>W7(D(X2$py=SjPSi79;+U){XdZ0ehahkAz+=%gOgWyhyKF%gRgM0$M8g zwuC}zvgJ7yjY(SNmsVmqi2hN8P;|Vs`^pXXeXO1Gy-d8ga{~~Is(5ESCjZ*({6ax$ z?D^L2r@Y}tegktdn>z7UxurQgrSxeR~(=&zT$SbRg!T%h(3>C+Mnf!gV>&d0F`npe9zPH%CdNrxzoaEdAr}Q1bCn2CH_8A)hgw_b zFUJ&(1C+jbDtY}O0a-L}R^5#W#r-bbrE7UA!H*Og@@;F>WHMiv^e1BycXaF~r`<@` z2kSRImigM4iqrY!r*kJRmP?Do!;C z)Q?vipVbEB{U+7v%v~DDF!L-gxNA^=n3EgNzs+*m1q8S}1|tR@j3!~eLl!TuyWEre zU95>9WJlI5w1+Khzp}F_VanPiJ{?Tq{+(Ry z^R+}0pa>yLc+8twkb&3CjBO2p)_Ym?G7q%2VaR)rviqHR!SL)@OdKAyQGmhR|0^tw zDrEKimR&6k2QnXMSH{YmfHYiAM+^);CC&!q|Cx!5Rx-%wf2$M5OJlTfH6wl@D$K(- z$K7j^5%D|&WFdAeqyc&Xs$n7&iUH|Aen|M_Q$P6}JZ~K=Vm;UV9=192SAw#N`)+rB 
zq2qx`(0U7T>qW17pOrenC|BrFrcgia3I-8$U{6LVDpF0oWWENip9FhnOS@)b7cOp- z#9hcqFGaB5DvQox~_om0Cw+Q2&8H;=z8Ee7xNt~{Bbm$C`R?;K#R@;| z$*U+kD|$P%uYMZLUO?5MGg1C?+I?<5j5TKNNh)pyYC!dNi(zTtm7l9(ZgP|cE8Oe0xwR_7!6u=gi)`7RXy(mo+r#!Ryh;$x znd-_Y^%-Yurue|{$Z&4`blsJDBXd_~><48B`pi_Dql*Umynm$R!(r>T6tTjHwF6P< zeCtb`$B8q}p5k=^q_`7f-VO9UAn#> zD{uPX)bS7ZC6f>Ryq^v~X>T_W8XBUB3w;QBS>6C6^;@>Cx2=EkG+FD}zg*&5t4zub zK^KQ#oarQD7dP~!Ce<=Rx5Zz@5>xqI`xOSe&CkNCk6Zt3EHApom`PS3kEA%p$|m4r!UVV>3_xCh*g@j>nQ?} zD*v^7yED%&XDXZFd0ye89EB-Byzep>{PZJ6hJ#$Hp5s=#v-C}VDAgB|&s2n4=OuaK z`_(w;Z21}s3OaM5p?ZmzRdz2z6*8vk$sHT&md2W1xpxws6MMc@zML{1>Y3Ej{gasqk27MqFK{!6cIp;*$Bg(;9j^P!BX4MhZ{r#YJgDF5?@H z%Yxj^E!GhQ&erD|O#>xd80E_+G==cbNP+~h&-})ZK)#OTfSAWMO8InEs^iz*!tK_uc)uE)bJZ6F=deamg890 zqpvm9)DZ-FX6NbEgP`d=WmdYOfqKiu;#^7A-Zv0Ss-H=;AQB~1ENdDsNzVtHF=2t* z&njP2cdn7!F(D&TiIRy`+(HDLKc(+Y-*S}-kQJ*_nDc5cH<-sZx|``l(z{hdtG-%@ zI=_2tye^+z;*K%@OF_`2xNZok_AlzB+g?o0$uU`U{axI0FSpAQ`n)LqXY8A{G$m1& z0#c`GU8Wj!U27M08$-N1&HJH%EoP_wnw?hs$8$3^tBP}J+s7$#BP?YfK06VpwU5nn z;E*)+x@($6n5a75S=Kt`Gj`)V2!5!jVrwnOBubGcO57x7$pkbgn+02t&SW=`P=0r+ zPPx9!Nn3h+?2^dXT^&qx9wbWaV3|G48UFd<&nkv+wTJ0hmn+{~lt6jyT}tDR{__%% z|GheJDovJWw>bQ!p^3pTf^Rl+Rtp)2dSV6-#qX5kC$r|?^T^EIC58^7c$03rws=(4 zI_wE0nN!y$JZ%<*GT6ZLQuKYEt#{)8I)xV(kGJO(flR6lr$R!94vzZInLgu;1a|`y zpD_Alyk4Y#cb&zh3ZKcWWAow;P2{U0_T_u(?~Fz9jESMl|K&L&7=mQTYmYNOQki)- zZOy-QbN}AdX2S~WH&4=dxitP}E4C%v0JEg&jfX3o+mNN{tYWeL(tc#@haGSB%IqJ# zh~nfOO`6|%0{P!W3+78#|1_5~u>X4CoiY0ut~hxVFu5FiA>8;5PaFTIt$5_d6KpZB z>DyMvNN<|jhi$sQ#psjpuZ6=(*hupYD8HU#F~f?rcVgDf4w6=NEBA+{QEwU0-W;5Z zcD$UkHrt{81Ms}Ss2H5?Y9nRJUwy*%VY25F6H;VV^SuhNnY_S5cZ)Kk%u{ATlgXJ@ z?hrnW2=?IIESb&n*Cw_%3YenmP{M5-y z6v~ATpI;DFIQ;sN;r~=!hoPRxUjpwQ_2*TCWZ{^J$$w2>TrMlhq_Z5d6@l@ zLf6FsFR|r;<6u%L-jbJQ%jqy3dpnY%BIIkE*SsiwV4i1dON&y<%0=2_Ba`~8QI3x~ zY-TZe!~r{jiw}AZchns1EvDW#|4vUaubm(f`xtnX&p^sVxZIluH<`RK8U;&eFqgmR zW~FE?2ir%JDI#|=;o7kh$A6qn5WhyQ-Z*+!c9$c6CaL!DM2g%CTQ%#{7>Ul5J|`p( zzI(#H&_7%RH-lNX3Y!vcgBPPVc)!cQL^?#as~&1dy#0?+Wrx*=i)a^M>8o(}%*gsF 
zXXt7@!IYWO&j5FDuM$f2`Ni2I{E1yon*Hfaw+;NgwI0_?p}%V)bMzw1kETOqvUzG< z`Y>m)YM80EvQpUmaU&Hy3$i!FFu`FB%oL`o{FnXATP*5_@McuY^Bh*HtijkcQQ^JE z9-o<{grCNw0b}|p`@}C?`~a0QXL~INZcn`4xi&E&fByx(iQ(v`^9akVIwR&jUeoVH z2S&?Pjg>!K1iyrm(md;4JDcgLuQ$Lb7gO4?>6SuknuSHr8nd8arh5bbyTA3pU$|=? zFIUI#9E3U0YT&YR`0Hke`e|6V^hqgY^DPA3Bzs^@B4+TUgQJ<5E1{KNG!!l`MOnLXH40yH|@DyBSrn6*7b zAQ|ET8Ay%GGIwmAExbX5vOgec{G7=jR%SDrxMsL^k%s5e&Tw+PA73${yPHWX_KlWd zCyQ_0LXv4xWG#T?+2aB|LL>UBi3eAi!uEJY_r*#yE$1yA4czhbb}E_Gd%gQFo-|NX z$Ph>%wwjkYoAK+t@m0Z>=(-IkuNGvW;^;qKe{)wHA-uqP!@WM>jqO z*;|T_QH?#y_8j?Dr2f2Lfn?qQSEQpS7(a_#lx!qR4aLeb8uUfiSs41SC3cAc#fB6! ztF=pe%jLMU5cP^AhiuAIF63KuoOW_|XYEJO$-A(rdhKy?P2S0p`j;F`TOeJDYkr9x zDQcOG>qx+{{K8fFRpiQ5U*Sk*w!%E3oJ>Rzc5+E+jQ@AvF| z5DuXuNrP#&)?t>tRJ}7yKWwkx7faS{*(cJRTjf+FA+KG1imnmWc87sKi}vXo*>!^B@NP)j z9(4Ih(SdUE^LirJuPrW~_sAbzCT*p2jDw!y4qRS(Q4dms86N~Omp_*l|5|6c zE=FBKzsy|on;28P65%a}m)egG%B45B-B-ydaP;+MM7G}8AZ?2_Ov)});^+$U3iLz% zyTLSug@Jd@xbXa}@0hN7YzUvY8e6e@ll6W=h&?|bonx5H{OZLc2zE>_PF6W@|CCx& zU8N=`wJAQnvSoHWtr3sz>O(x89zU7L=l21M z9c9K4&k+EQ-Mib*Dmq^Tf=$Dh6TTC8d3v}-kbkh`K!hMHReb(HvxvM${ znu~DgbRFKebrPSXV6Mv*l&u^rAyu!Bn_D?egb>d-xEAsnT?~SlT^=vRLxu_EK6|B) zo7I^|r2N_j-kETw1cjWXJvbUL909(bJWp~YZ%eoyKAP(5^B)0d?u(>)+jR$$-t0R+ z*|MCVRztyJ@QlSBzZwcUynZ>;FazP;KFz;W!dHIC{8nbR`}B_KZYhn(N|E9@?m_+} z=<9pR9EPpR2w?UW-la zOVW8#ii>RseY@C^yEoVKtFGHbU+F&hRy}HoW6dfQKqb|%n^xh=3#z8xDEC^8meJ~D z{WB+A-B1GMP@v=y6bk;9W^*|H@ASie<~s33a%M_RWN+Sb1!g-@FXmC|$;^MuAYxxv zms;mYSC6>b{u<%%nw#n0LXO%qi9X=3Z)pcz^9Tl8!sb4)!$ED<1tTxxr_Gq zc!aSk*-!ux?PjLnSl3gv5N47VwAnz%9- z*Qd2f`V!tRggt5mkuo)8bJBAk6J@x;MZ8zv<>U0ox7O63m2id-IpEo%%M>~0%9uYe z9_^udn(N^4co)Xb4&kCvt$62E_WyA<%+qL>)xyWW{rYk}@K#a>iLmaDOtVT8k8t}nv7^LFD{kk6cQ2e3U zg!%A&@5RUOqoDdf(QeZO@BF}8;~zv(yLjK6qF!L7I*(ffao8UPtlkvo@O$G}TNE~Q zkaVXt%e50j9jp3J;sS2k>3ASPkX1n$coNf{TgYkzfSw5 zb84{Vs7~XR_+l#N^$w$BF9{Rv?Mf=mf;iymj zV0QxYbY=VnWuFaUi6})$sNR6ZityMRCwt)U*|R)sjy>PY`ficrA?!%!u&Dl01*1)_ z5KQQw=6!*|^?$4#i49I@(!ggrAKOx~hg1yZrr<)>_v%M8yxKcG(&y$TSDWD|@b74Z 
z?@uubR;+PoRGSteo#)>Dj7{7tjXY76!z6zTi#{CQUG&E~(i5W7@TC@Qvu>a+ge8`F zPxP3wK4V+1d&B9&MC?n&0)<$THQZxp@FwTN@LX`PUX-dKFPXuEu9)7ly7EfP3%!Ld z>6U6f??)rJVj5INUCfe}K5!8*KvovF^uv{JqOgZnrIDhq?M zQK|yQs$sA}_vYHB2hMueX5ipLpnOJ%VagH9LEs9Z6(S=vGyl{3CD3!FV#z!xgB6jm z7X6St$vloHKUZ((fmTzSXr~>hV`UC82B+{bR|vLNW_(njK;z_)^oAfT)+nrGTw#uu zPVAkqbBB$xOnT998HC5ZarJffaux#%!|YBg4s-~NMLnkWg?_Tw>?t(<-jchE{#nAx7C)HaK=GUDVccd4B8t!H{^?v05Na1 z+n~-+=fRts@>}J9}ckg#3&<-ZYD1~X2SE|u*52K zJ^|0XND{(+@rrh(04Mlc5M+pXWX+4{9zhruTfFh+O<10|ZIhEy&m!HMVn>*jelqDyxL%Bf#x>8H4 z_MV1>yd4`t^(b!!#8Mc$9*p}(>sVn$%mM2vf-Efk65KdrC3LFl!Eb$G zN_(8>jQL0br5<>-YYVR<0S-ylt0yB!;N(d4z1uJB;LuG-68=Tiv0Y%TDujDPTaa$* z;F*9xm=Svqu{ICmOcD8Qo=|@BtIBe*0Gt%~iyq|WQ30kuMa6QV-X{YIG$ASmf!B~Z zB-9<98WSY6KIDNYzp?ra%L`Fi0*wDAZ|xf0j5ZXHYtOJnKpsNsCl2*)h*^j3NidK8 z0ZH5z?|1MDYW2@>i~2p9o$dcV1?&`5lL*cb7Z274rxBEh;BAfT3g|b+gLOtH_d=lI zZi@O6I)*-c*LSx)XqKrV=Y4LD^DmW*8!IR&O3kfP7ziJLw*?_b}F|Wz{(J$T>^bdSFwRzBPja+(j^TYT?mmLwZ7QXXGRlfX5i{luSSO`um8&SP`26gz#v?v!^#=LZjWyG+g z&!xoGdh2$$-@-c6y4ksW7~?LWP^n%G7HR@qF8UT(LJNUeUn(v@bo+Se2lRcvI)%Q@ zuj25MRw_pJ3ul3{ zNY%ps5i|fiN?F2w{=`}cf!pcPUQd0TNt7-O6&2GrU`V=svhh>Z^0V%KF zoXfL5gLI0gVFt_A8*}^O3&SFY5kYG~y1&@`S<*4kY!p~t8*@*H)9Hp-HA=Xt3;}kV zKeA#Z@!u?jQZ7CUu0WtapwK!&d>XfuP6!oiB&%hA{(R_T$(G z-Jfn>PyGtEFZT0oahliC#mBOhnicZI3Tm@EpA4J&lUkODqbvvN=FwtjRKo~H68pNw z1E0N`!oFz7X2XwGwm7hAxK^`=ve`AQeB~ebVZ!40e$&u1^QC9&e?^jdkmnz_r6VNx zu9%nMKSCFZEc?}snRda#;pD0iH3=an1w?}I*8XeQi#=F`W47mad_7YXi?4KVa=9nZ zZ-7yxuprDJWiKCvrZ{!`H@G)uqm=vY(dqL~IcA?gx}|sKch?)0eUKr`u==Sw{hC#% zF-A{sk@kC;U5=LTg&8dg&jvHRMc+aCT^-x5k&ZndHZOj7c7VOM> zRV#ik7t{9+tE0q9x;_}JMymh&nSi($4LENe1RTHAS~2WIGNKH^kjX*l`vULn-=*^h z-)cC(ckE6UlcEO4nh+IQOk@3Urpd*gubk1n?s7|mVT^B~uvD}Bx6di1V9-VjnSxQ+ zh_vg#z9f1d{cP%h&$@W&VU*ponH>UTxeyTrmz+swggcQR-+);RM!#l%urWEQwhq-+ zs#Wa04~`w)uyT4_>Thb(<$L%M=;FIX)^XMw4cT&Z2qG!B2sw5KV>+xoyQOZd0gFC(4{r{L%XmJKyDu zQlx!`JN=tngNjwt<>eRa-8UGGZA8ts@~M}Ps$S4*8_4q#bIn3sf>YxQ-SdOV75uyW zFNBq!!mLi0)}!h*{BEhp+cGA2bAzk$WoY);>=ge7y(r7sLh9XuJ(RPNKT)d46uB~| 
z%I5c=DW8j#;Tbkm@q*{9o|`Yq_->VNZ;Fo%VtcL^!ikir{c`4EUwqD1+N07YO_`8T-QIY5u9|7*JxU1k1s4nQFGa(){W8XqQCCU9Gr zt;G1V=LGvH7as0t@|AlPjJ>p%gXQxFXgTJD{@aufB(e^)@@$8gllQ1+-?%CG!NEuzg@29hIZZ7NL1tgvg2B>WJa+9bgPUxF$o%6VFJHJD3w@Tx z7KOBU7mg&odikf}iz{`xY>Q@jTV+-tes?~MN zu{Yd~*yg&u|8VP(X%f(f(Hbl^?WEmG*=y<-;zwuvS6KUxt?fW@q5aXWeBn9wzNxQ> ziyvk(Q@-6s8N5n&kP-~xpm=#dgjDYA_2Ak_sf0R+(BWQvy>@JWgTLC1B8FM<9N8|u(#AFQp|LJj0E3T6U0cV_KY7n^%* zn~*u_6T)InuvjnzmRz!pm%=*^aW&qJZ{WL5y1;63=E`<&ilf0mqJ zY0i)AHjVq$4!w#(Rk%x60s9^=*vm&TQtHIRY4~R(qa}yDEPRmvq}U)toCD9V#_BI+ zuCa*BoK&ST`+NhylZWfPtRI`o6YTNlPT>j@u~A0 zj{RbBswBe;m6st{F$e^lT`Y%sjCNU_gxd^9{~j0~0qPNUrs7DIxyi1OJ?F!F$9k(E ztD6C${ic1in^bD33*$j38L-=G=W2QmamHf>tA#ma!H93A7MUX`N3<;_^1nX1KdT-C z>TBOwvW^%9I{uHNv+!%`|J(3jbV_$f2uQa`jIT(SN`sViBi$P!Dbn5D($cUIN_PrK zNcZR(+q2*E2b{fL=e*BOeeUbN{X3FkME)~IE8^ch4DmM~6uC>_b-{&>D+rap>z21iJQ$x&Gwj{+hA}>|!`w2ek zg)bf48qw~}^td2ba2OyBJIc}ICTqh?Bv%g&T@6-CY0N8DC8 z{3HCQA`W|Mg$m4NEH7lwIy{ZM*eKbRAb~bxZP* z+y3&un(VV_u+Rd~dr~hb3yG*A?JTeb`faEVM6=H>wHz)yckD*C-IkR;o~R>lmD|fa z_stIFPzAIJ z-9g+-l|g|b5&vhuWwpImMJ?#4`$+%Z+g&(B!ji039sKF(#(7{~@gbXl(qNO|<4jra zKEv|V*VyTQlB*dSy_$DKr)rQ{+g^q7+srSL#Y&07WVO99r%(>yRExx)V1Mzf{3 zAsT7W5qZURnJSaeI`w6~kVWh!3f;rs=}zNXt%DAjN7Pv;iT;X-zFpiTK-)$((g0mN z*v5LZ1Je6wBVCyP|DYGqwwFM6G&+s#Vi?vGeXEVmYBHYoiW}v>K?DLd{lp!U(fA(0 zYUux96Z?Pe2BK>p)~D&W_ohKy2T<&EOEczyLZWwvbGc(Ehc-^+-)8{dc}FOo4m^Nz zF!t{{gcq$Hfr&eqYkv1UoAYb)$9TGQ2fFsVwADyVF2NJ&TMQy!dyzclqY)aM%Qx|& zOJ59xh}{GVpg8H9qIZn0)tW!+2u=bujf|lg$Q5_(3+X!L9Q$&A&$-H1H&G=}-^~{v zlY#GaZpVvU*SaC7MUJk;a5)~rR>HdLCzb+~fXvqwch&zk=$xyZvrf-GYM_X(?YUSs zR9t%8)Leci!|jYNJQV=+il?_ixOI^(Gkc>bem?-|=Ib$4)QvZoSU*w%j&}FhZ7R<| zdHno^Lj(vxu9c&BKmBXzp27(C(WHv9ob|GINX>U!bQucQ>1Wb9>ea7xn@(+{2H5hM z)I$5ilTJ6rp^tFcpywyr?^RfQHmjUH%3Ny>f8yOOrFbjtw zUw;h8g?GE=&0mx_IFEX8KT>2fiMCuQ0^6!$H5?i8-8mmvRB-E-zA#mI zf=M8f6!&0a^lxoqd^CtRYZK%(?lPe8{7<|MC%-=(J@ljFFIidFuFGPGAJOmK-%R47 zWjTGRGkLyooBYj|1+Hr^pCmVU?{MT6%azg>;xI>?gR;v;CuQ%Ldm>hZNQWSdFzF1v 
zw;;>nXY~MIG;=c(1j)#3Bv$vpGZ;K@%R;PA|NAviV){h(^cnMMkppbrM=V__I{O|| zhd@Pw>jI49+Lw+Nb(9r~;zJ};(lrEj9{dqtbQu*B9mK)q+Hi3qI&1E8UuxH+Ao^@} zz+oNFY%9vBK{o}A!HCfy37R<3*+~6hZ}2nZ={&ck7sY|fSHCX;cmfu$+K~S3Q#Nh5 z>o7@o(zKK>AgZU!zWnc(cVNPv6arIyYROsocFzBXvNupe06y6N%P^!L{$v$xsOyuz z`hGcs$O_MfKkEq-eCuRl6vev}1@{Uf&UQyzUP#@w_l+ukzHD$cx^Sh~MpX%yC9JWuIX)%6dIqexj$&F(9u0S7RX^-L9%fBnaR2X2 z8_-BtKe zfhYokc~8t229kOv{u#cTyj5jP-!*u1R_F3mcC!CNuBO87L02^w{|W?gvxW%_uaXY5 zt6$h@y_Dbxv$-Scng1957~5Fj`2v;u8A!W)%O6C%Xnl%vv5~$r$ylBDp<@{{0rWrX z&8h$vJ?}-?t$2IM2vi!gz@%@PJ6te5-bD&s1u=lK2C$&}sv&Pou#1jF56Se9<`IDb z1E!frx>)%O9Hkv==F0+lVTzsiEq#-3J&oJd3HIY(P#05QJ*|&CMr$2?l^~2^EoPR; zL~bKN5U0Zi%`hPKPq#C#93)ZN26GTD_)B1Y4fAx~m7A&{dhr7l3R;iGZo-V$2~{>Z z1Ifjsh66b7jT#$hm>jGNvn44)B>tTqp=Tz1^=*Fgsw9g^K#5@3n?gJ2%|3tl%$5WQL2D=M+!f8UW9(-i^qPQfT0z&fy{2~`Hz#Xy z5mWrWl#EVW-m0Osy>@V78C?XlfQ|J+DPZ}!Lv&>O$T;Qk{O66Ka;v|B_;AYHt0`*) z#U8j3Y2FKLd)}tPQ{$jnKT0{ltb4Z(IeK#@xS92x=xuIeC9_wRtS-#S9bpV-5D<+VQTAnP%Y-Wzq@AnJoGZNyHBlK zIQw!zgzKY#i4k*Nu{g2T;h}bOab6x3arL($~r+gkm&Aw!H z)FF8xd#=9jKPZ>>X}@^pXy|)kcEIxDbBBIjPH@u2df3rRR>k<66vryAWdY-VV1-SE zi@$CvA#C9@bF>9-iU+k`j7%!*oYmTAd-WX}Nx_&-GNmeqCW-ys%n87UF|rgicFi_m zyH&^{6;kQW_s?rQ`h~mIbcFsrNDy&vys02>F|IV_(|jPvUq}RN>_HYbri*)eBy~vA=Jtgp!exE`R-p2VX=Ov5Oxwl+Fm;up(z@R0X>oy6KW$9t>tP{CgP zo2ZV{X7bE28Ih7pJJ;N@*CS-H2n9`}QiD;k{!y&0cGIG`@4XOvWaU6lo=Zibcvi}U)`4}rlqtuhuA}>tKTdVwq?o0G}vO}H-Pfjn?wt~E=`YooCzho{M5@y z()Piuw~vif5c&>^HX>gO&GKUBS5Lkh|1;fiRxwGVihtDFmSC=IU0h-Z)z0~j8v9*b za7-JQ^Oc9jWVAIpTj`z*v8})a|#4__~>$#ur?~N}+&Ey9qjKn1TS zvs@&opVWPD<_8)3Tx?>n__eb#rH?_ws2zb>wuF!9a*T<`msZ43JCQ4_72fIrDvGS* z&hON1yEc{0uXOx}0vGrc235X-!~ z5M{|`G1ej2nbR_42-y_ptdrB(eUZF0S3+p1fvG@ZiiAR-;hRJD9wF!+}K#-Y+6I;5gOT4meiQPvrBXO8t`Z(C13o~q7nEXTKTuENo@n_Sn3mJ$R zJZoUEOXEPIREO7XE~P!PAq&r*1}~m#nTCWwps>9|CWj25-VjA@!?Z$a1OGQTHi8wA zCHcfNVn@3xfw18KMWet@cRDHQ-IiRnOozMpcL&3Ja1$K0HyXTzUIi4V9)pvheqVQ; z#0iU z5%JnTzMUhZ5WtSU_`C=kCl3Zkl=m-^Z!emG40#3>3PL|KOTNrRA*qfSGaFnQH*q7F zGzB|}_UUOo%E8wGa(MN(8%|u=Q+HQSGI3g;453@G^@#m8;&}`WKtF%G=42 
zkH00RpY_jK`gVHPS=?6pfsFXm|Es3B}je1hf@sWmc<^llI3?ZpL}nijSR7 z4)o1&1nHR@m!cxJCxPV&MOhrz$X6&DjtqmbC*lJ~m5^meQFXs2JAOR3k_1nvU-@Dyo zvD1__6@v|G;El;@~ za<%@oQZ$OJ;~D-_!?!_Ix_0=YLEQ5b1;+E2vltfX18GtWZtpU%8SF)E0tRAcuoMnD zD<>l9JViTpFscmG5t`!jkhzDn*QCZa;A!G_LmQfzF@>1?{obWW#}{H56b1$YCoyt9 zYZzB?YoH*(9?F`(-)ysCwy^vyhW&;2rXQe*0n?MN9V=YP-6*5K zuETl4pLHD#u(mnc36F^A7;qb!W=0bv=Hw=QdEZAWD%BrbGGqe|Ii-*~J4?S5dL!s} z+<3HQ0#HL9)jmV!}`$qLs)o+OEzmNS3XIYn^cJD>b8E*$I#LF?}i3~)mDbngpht{Iiax*%- z)ki_`%kzh|_s5Z&cIzpUDSI7^QDw7eIelb!t~a%OmLs=14s}ypeyh#GYN$&pDY$AB z7fN;jaB}=_m<5TLf+m02kCn9TO0PLFgQYAvk%J2>mBOx_3}n;Wu|D)9 zxf@1?)DJWgZQYGXacm6r7kmR~5!PRX-ahZc3-BnlKwqeJm5_8s18@df1Po_TN8vOWjN&!-J zQ}I7n?s$PdQ;P-8IP77cf3l&{w;^JqPH~l{>NZo@;)hI@V-~lafH)y0gQ-9x$pIOAFAQfWAk`OMzEAlZNkC{K zH&LhMrcWTnex+&Pdd9a+u!!}F6Wfd&;NQ`PZtUxwHp2L=K?_lAH=osiJv2yU58H4? zxF@gWG~4O_EO+mEn)rw#g8?Jg=vAe?lA1+!c)uGYf6G1hK^gsLFv58A%Pm@7CW(Fs zr{s~rJ3w^dl-?m5PA_VL4Zg>N>=HdQ*%7u(k-h7XlIK0`9bfX()CiHmyLo$oo!>~E|!0p^>KNVL$RTv2iar`dK|zNFt)?8ssWg}qZ(W&=P_JlM+9_kA0< zPzpQkN`Xr4`ij5xv1Q-O!i2xa`P9BDnH%q5S?lQGl{)#gf2yTlsr;l%-I}zYMX9wg znPr?rF7JwYT;Ovi+{-Z5(ib-qDj3V>t+THk#jRZ|l*9&vW}eSVIIz1NwB2J9vK7I} zyvL&0cbgZ%WX75kpD;uM6=sj)_ShWN%Hq-WlmNV`eZ#Tiebe--uf3xy1e@r%lNa`| zaQ-JHX0K{RH>K)7d_ei<$i0^=I}5p@tJ?<2^87FF^~s*5wgFpo{?9!BL%lJ=BHv-^ zcL#qo<(cQP7u(lNkG$1ovm<4j9k&!JOHd$-Q=i2FkdzRqmOTFMqst!ri@u@hWt$*Y z16gAS`S@EQ1UgjToHcG0nqs zq ztblPqW7uL+zDZT`s?|0T(^iZ?mJQp^8^cFBqrrl0+d*_Nzn1yEN=S0+hvus;9$tDy zY|!1ZAg-|bFpl`EY;qK?kE;o*!Fmi?24XB6spTM>)#687%%<^)(zA%eAgs%QaNLi--1&L1<#0{aXOCFVRuYzBiD zOIyRS1;sILJ~@w9c`h_c-1i!eubqogO0S8P44f(B6uA8s^5{%lc8sVHn`e`B?85q1H3S!4xs_mi&CU%4_pX8E)~xha3h*k&OiCY>5q>+EbCX$7_k zx;lKaRET^D(hgT*n2NW;Gf$pA_jDx%h=f!9++6Nv^xIFevcYPlF#$zjC4jCV*im1g zViB8^{ozPS!?HnL^VD-z7bJ;m`x@Xi!m(J$uw{c)C|hku6j(5Ptm;}bIZjo#Hffug^_z9NDDhr@W=l;qXtM8?cq4WEJbzi9J7D!fF4gEOK{%EjMAd(moHvR=^9 zq$Ol-dsb{heB7pYOvPRJuU@vHYlY#ens9-gW*%pn*f_Z|)?r8u$VS)Bei{G2*Cq33K1ZN(CEt5kMfj*JGyd?eyw<%P++S=(iuw7^R| 
z#iHtFGL0$*S>QL{Fr*NmeV>5rSe%bjmp9N{Yp%_*LV?{cIo4`lMatdG~Fi*91sx#yu5_YB30af|fyALN^h1qBKF_o?G4aBrScKj`Wsv8~DWBI2_u z?>Qw4viFnqK(EC73-Zzp!OjkqaHy-1Fw6P|2L4SbCv12u>?YR1I!*+rgu8Xvi2cgW z*50I46{%-n)g1D}V^?(F*UJ7{{Yz0tnxn-CgKG5H z$b}w;pfeSnNr@q9Dy|Ri?gcciha^v>b-X_e`3K*V>CZB%!X*rkxp*V?ornF5oha(j z%Cdt{A!Yi)pvK^@9IZm<7@$aE0;=NXzn2v$xHg}c8~N$3%!j`HHP-88 z?;Il%iKLKIZc;JU;h${zm9&^=nD8Z40E^t_HpY}+vN;; z$q0ekI&xr_L_I*Q&ub82CeJnd`moy^o5iP+8P<{oc95*8LJnpnby z%Q}e&T4004vx{d=yZcOk z*$(wf&0rz~SxNQberw%L<#SV$!(jmlsjC7apkA3Z(TJOry^%uS9cD##$9nXRvJPO|f% z-hkn<6GsUwgSQ9fvz?r4sTXJ1&pprXYWxb^(zib1l^6m!a`b)gdEu1nxT|3wy`|J| zT~Pj6EVo`On1a=}>CMBNtAI}$TwWDGJNFl&l;P>C0|l084R!vyc2g`-NQ!<|)Rh_^ z4S~$@IN`zQ_>luQqw_w_$l*zbbk3Mh6`39o7#xqo_U^zjZ^?Z#`a6h_T?2*jMJ_!@fdjrm~lT_&VOi(A{8O) z|2{>p%{7UJZLyq@#sWb)Yw#HnDDl2>u{F|rlrFJ;p#8r(K{LIOe@}4|r{tDxQjMFY z-9Z*DAr@KY(X3%Mc2f))snyxUu6Qz!Y1kKhH$xbQ^rEuT|LDYTGXj(Pw1%_|K1{XF z$CbAo>84-P_cTt=hmJ)~H859%I@F}!1^D26mek+UZRt$mg2fgj z{t#}4Fg70V_jbiX!$J2X$~)az{?6%4JEt`YLeX(i=~?-OLf|I&G+kjMqJj0xS=>1L z5xf7Z#ukRwnf6fCwl_?Pt7nd{HlzP|-7fbNzj+9%@V90<);Oi`WPkamtk3Y(Q~=Z0 zKl@8wqY($@5To&oBjxw-_|0FDx)AKV`C~|LB9a;Z3maY~!#^4|IMLYSE0X*8 zd)Ef6=X;7$Jg0robk~DarUJETPRe#SoGz~5BODYO4`>piq@S!QsV??+=h}os5KN?r ziZfq?^X>Y;tf0d62PsS@-8~tb_)~fM;6Yjm-3=W=OoG5pD|dN2Za6%Q@i*4& zQNSm2AV%x#fl@F3%!t48=nz(KMafr}L3Z%1>qU!*0XkP7`aaf5j?>T0@< zRnHaXnaSxLb4Sn?pfiLirB2P!)CIm}z#|s}CD_ zGGDBC$%tZIorx#BdWLLYn+eerFlEeK@fRo1NdYxQRy?rfrnycCtJ%CIX&;|5UnHJ% z08UmQLj;6H2rxJ;KF#Ru(`5kL3;TNkxOCq@<|7i$!}MotAw1f_+? 
z>VTb}RFxq+4rgtB0nRWSMl7UrCS0+E1;c^QQhRHE9o?M9g==f^i6)iSjf&Wx0bNN5 zEvF2yA<-xR>qkrl?=Mar0M34vJ$cwnC)W=9n$F^0VM1EG){Zk1Pq8@OYl7Md+=A#W zC%qoV{KSPw=`*3hhGBZSfWj}viH1uRG^>B#h!I@ol9V&`fK4fi`Xi%Kt20Sx=yx)rPC78_ z)L&?0vo|EjnhXIAXM4X4PN4Ms>|TEq?apktS$($P+lyk)P46mYFLv5}i=DBfCinFx z!z&azk-(SF!e^gp5>oTIYvnm=Vn*Sz*hybC>VuvKY$2HbfVrm6$xpBqR!-(eP{Vtr zS?s0$&`EH1k`g!ej#cLJqrFkqi{+o}(D-Ieh)1<%#@toot(+D@g{|8TTdS@{a%8PP z>&kZ<;tF%}!|kNTmHe^Z;|k~^cA`S4UNpThWR4A`*##VDH9| zGB)rQHmGXiw3Oy`)?p@w5(ky3#P3FKe)JRiksM*b!ZS;=eKIU-ddfZH!X+3AVal%s zZyH_fidKWBb>Qt7bsOXR)>gSonUr_7L@M0U=W%P*R6xW1xoy2Rfg0!7d!+2I*Jm54 zv-v{7^CU@5-;fm%^V>a~I+qjo>#aKUwMGrnX|fM5{~=4{P0PVlW)69q&EBixH374! zk7-13OKuCmT~VruNB>mlFS$}UyRd}nj>&g_XOrDZSMSKL%nYyphH!wS;q+#z;G+9s z22b`Nm9aGvQW>5}^>G8IrFSJmm~$H71+Ulssz~#d*y(eT4_-oyBP$uKAUCg=g_Qf1 zjO+bTQ!~j5X8vLRY&`L)o|kSBn!MM>Wg%}-FS@>%EXT^Y4JL?Po!g0ASHoN0fK{%E zPsrmO-EE;CF!p7;!f&6WdQsIoMz&!dn$W4unL5j#Z@`#v@$sY^eUh^%wRb2p#F_pb zv??)=f6JURPNsGU%yf@8EAfxFD$To;LtF19+9uJ@Zb(YX{n=sSMUmL(xlXMx;%tLQ z<_j16H>obSSfZ_w%+(x~erEtNw66{>qm`;bbz?;Q)!A@TRWt6fAqZ6t=ar#nCU&_R z#JR6*$b-M(aPB9(<3(V)A>_ZmSxKbk3{C`Izd8i0^;!852?8M=R4U~st3wgj1vk3B zOOW-=I`}2rKA;n<1jH#T5!j3=p7Wo%MB6INq@UU)N+ypUgLQOWhwE!M#-L|l+gioQ!pX13O+8!Ok84$>TtgSoHh?;0!k>Y z#~__;DV4qeybmx!Gbf^5L{HlCQvArt6k>OUsX(H$O?(6ubAQBaynEn6%l6>H(Hp^a z2o0PNK~(qhVm%3OlJ-)F=adH#gazt7`z+#B1pUE(x&@RiD*Qy6*RNd!7D0X`Xwgh~ zxR}jHpIw~@nk>9#YusT)6Rj6#TvG7(jXJo=>vy&)js~LGw%Dwp%T-Bxg@|Gc`GUhQWp^AFH>41l(B3Mb%FBiE!(Hyc;`Fn_ z-794J*45(AOvz8bA=8H4t^d|$N7@gkMa4_mVs9P2u{ZF8ldfu{o~AiO1%dxe1hl*` z2MF)zLcZEv z+kE^4B+ICojEDbEOp+|V;HHJcM_uRSov!DH1L~(R^aLiEkv}Tn>JzXSZU!>t^=W%p z1W+M=I)4jt{sIu;)HcsT=-!*prfolat`VZeiHb+_L}r^{bHmw|=+~J$BoiDUXzwJv z?E)=I@RZONXoR)Tyd4gB5=%shcY`MkpJmns|2l=j;?dyGi(C00K03Sdib!kLqeqeu z3e#dE1Z+~VqlAoDz;0?Ys zm5pTCwDZf|2HOKXEbYg1Vyr(Z?~{yEfsQ&ewDv(*CcL#axG*ep zGUHvH?W71uaFa)djt0_m9pJxP$+b{kzbEO3Il0BhTMl(o-D=@*EN3r`Gv^8NKRC^Yf|mgtH`0vU7r`7yt-^`Op&SeE=APxZGY_&6>n~4D z;4IOW&ao%es(z#`XdcT9-#qB4KSE;6>3d5<>qRh2=Odl^ykMa0nvc)R6T*k}*imic 
z__AiMCiwFy6eofJ)PI)ug#gE{A`kM zk9w^?lhs0DvmXs{6hV7Ym5m3q+@6g;mIV3N(ABE{fh{a-ZAugZ?*-ws`dt4=(_9#b z9*9!zg9LG%F`t4D%x8)0#a5L6Zu)NSJ@}*3)A_q|tsT?6{_ENk$*3X!PfzOaZ;jJ| zdP%7F_2f+8@I>aSOPsuZiaVw?|BEsYu&%qpwQp*(#`2?dSM+Z=nu$#3kIPB-=tKTN1;X z|M4?`!LVzp)n_MI0lMAUL#OOE(%tgj?eF~a`e>vboYG9MMmro8{1mg~ue^V0xOutI z1|-Lk;Wf!;x5>*3NkFx860tkVrXZjR2q-%L^IYb=@V+Yc?MAnhh4$bM?xg|yrdN?` ztn7b_+DWJ?q^(2&8@bOQX1r!MBC)Hr|6Fcs^#NXt+fhy(?)!jLrr23u1bHkBI!>;N z?A$ZyPxfoOnRp>o*UsExy3I`?`P0$M6f~rRU3=^3^|nQzl(Rg=Lp`y9n2;aq<*k1B zjKT|3pdYFDyx79VOO0xH0vjY9zC0;X`|%O3_G_YgWS|UnTB+70;VL&B14RdpPDfv7 zA?Ez<$k(*D<=(-{Ca|PtLebwVSp)kL(rKo^m}doLjmJfO_o0o3)i2FSU-Wm3C=I0RP+`W;kIl6C1>mJFzje*|3qV3MyypJOf-T9$hyKKX0E@Dlq*$l^ZJpT zy`Y~W=~`6UG1c=dz6~K=8+pv&Lk($;Z_uulUK%&wbvl{_;AZcggNDHbOTO3mnm>m* ziB!ksDDFTB$QMZSpZJfe>}yfT|Ai$t!30M7%|A-5$hH+9I&tet(02@x(z3(Zh@Wm6 zY5CqRdMo;}f}GxgRco?Rq_Tv?P?AA+(|dT@@|;;TQns`7CCIG=mHh?Aqo7+1)EY`9S{^Q-1uyL0k&s+&w4k< zC+T59zh<|d>aaAMsos=SPJE<8e$;qK2c^j-`rZw(+SF2l6-7Ho7eE*+KZ;KHfcsUxNzJjY>!oGg{ z|CSlOL2<&Q_yNYlR6FDZCqXOk_Qu;0la?z1oPzlMT3@-2K0mu1xGndoSyams7>2MM zF1GmQ$6cNgt)a_;Xj-R7rG?mp$&_xHCuQdcY$9d~BpKXns8 zl=#J`)91=LFD2`Fv7JY#Rt&2DA%7v(lQ3%?40h#r`u~1An0$+pXF+K39pfjh2VRc6 zMb=e*^!_vPN?M2DAehHf_r-J3dsV4qn^Jb^FA(2)&pd>kV>y2OS=ByB?^(_=>VJsX zIfyeTO)tSkVKK`tPEG~Aw*!9)tKq0Xg_M2TM#qKo>8_mic;=i@*(GkPeL^ytD* zz(LHY_ML@JpqoHazUJ_xw98@4ZaZbQn*8?Js^+ww&g_ZTmdWyJjyOifJz~$hleC=9|6V zHiq8+aue{{oz2PBrjVUuDF7iYe@|3YjYcp#l+WY#KwTtTIY7$BUr60+>s@1{T!>DM zUY9-!Urnlg-F3b_N<8a9u|75VyiqKfB@yhI-LX@8f)P_$k~W91ltO#^%3S8SX z*VbERe;s;2w8cS({a3A57Z6A-8$AA3ej9kiqn(T5TCy71LW^v%#ok=XHhOBm&Q#T&8(K5@NFz3I-)Ped8MBF=fwUry*+w`+& z@Yu*!QL*zQS^8=GldPgcKl0atFgtGq%B zhcJlXN%JV3F(WQrY61%x?&LHH+7eOtrZYNWrg!e3(Ywx9=#2&JD zLT1;n9t2ZR5iSknD3A(>)jq~{v}b(sj{Ck)hFKEjcOMV_ZdR8?%s#=jkK(w;KjUb! 
zv`gM+!R-`zO0_*wMch7~Y(B^AS-hmUGjEN{>}A;n{~cND*4_a}y_*{YeL>xvi}yp3 zT$|c1Z^6Gwcb>_zbhmoY(e*gsg|`hBhbsM%aubZ0n>--88N*>@8uV)XrW&Ow&GR2E z5U8cp|5ffSSiO9sjB>d0Uo{~#e|l!pVOB$jXEth5E(YmeYPekG&%*N-!Tn%0JJIYB zRIc_u_FmO0bEHgblHj(6cQ=?1WyhzUZWM*Wzi298Jdqy`osI8;IwU9hL0m^9*`Q@5w`OeG$*f>`!6=auNDVT*z?ZX9B=ZKP&GgK@`92O8V<9j zFUfz44Q##lzeFgDl?3tsnTq?shdwM;mYSWxyCvXh;oF8QN&a8c&EKmc6=+d-_nG=J z^|m&B@jIyI#vqrYR~kns%LgaNo4jfA8-7J^xu<})qx*H$zdf&1@0x^0L?rK;vc5}* zPaPj)vKxolv|$=pV%sXhqGLX0yT3X|UKya7j9M6#A!UHqykL7J+ibg!7Xez5QiIpu z@nOT1SnTLy4x7mrUUzgIJ*M^H)#{u^Ra1Nk$a~o9YFCq3j2BHCV5g&nB~5Hc(J>kf z{cN-MS9bMS2H#QQ=WaUQ3pzvGSFuHE=r(d}1KsC`>EBj66&OBf_<5?mE4wvMBd#+_ z1be>vw5xDs9(v@?wEGKcdTnwFJHSTdhgZA*YSAD?f=iwTlNg5^vGJJg{!6~paMz0Y zcwy!XRywnlv(Rs%Pc-{J{y{i?2Eo?YKk~(u4qFhR zF4i^8SV5$GYUv|9-v*TBQBH6xgj>CmhwgYrXC6u?W!2$9NO(obr?!-Z`6(t}!V|Ga zx!Iq6Kw1e+tB7DYZg;-``!PM8iYg)|>tTBC=Z`rFK@=hMuq5XYpfECzTYvcr1X8SE5f)(cVcbFjVf;P;*GuIm0UVnVSXx2;n0%O| zcSuFd{iO1mH4JoFfb>c*n)O~q)A={!ZTgo9Ix&Z|FCj1LiDWRDfvur*055Yb2)~ah zvt|jD_30M!2;-0Ax3AY6XQJYPY#q(`KNlig#W@AuhDyt)^V2II&!|HFp1X!Y5a<^i z!6tR5fu|LmW%7V<3QJy^~*yNV7%(UK|s5J(}K$%Nj3 zXmw||lYU1^xmi4PQZnVi%7V-fPiAwq7UvUUs7Zw612HSEoHim(z#Z3*q-$g2m3mEZL<%iP+V8*PLkNhJ^)5q6zU~MREyC;VsL@I+Bf|Yl%lkgZz5=xQ} z%25Fmwh_w!%`i!&?y3rKRkNdNe6;@hxY79mWH9mozJIFL*)R2%uHZMXAM8ROVZJcW z4Y~RO2^~pRS1reh$HjA$036c~f7K2v!98o4zLg!f<;Bcd&7}GUoqPIbd~S-VmAps> zcYhq}_8x(=Wk7??@>xoN(_bbL$+IB=0k{JB9nERJx}5WNdEStKp4Vf_;DiA3 zFh`x67|t^;*hF6+4Cwm~q5@%r8)TEJUbJ8HKCnbOujFf>WH+QY@avy5`{lEVjb|Lr zzCy7Z*X_Z88zl7rF*=kWg$y=)69)m>eq5CD8lq~~ZVySHU(2oNe+_flkb;x@y+z={ zr0Q{wF+D$`upTHZnEccp(yd(=o-OVDQ-(c8<4nOXQ`Tlhh#{i%)z-auOPwUBTgVq(EJn8XQ*Q84!JkJ~P{CFh;HYPQ7&15#C7OOPyr8VBvsDNx z9p*~}E-x8%(CY{}y2E`2bb?WA;BGt@5)@w|^x^L;%a5lZwg8L)ZXxL~QdsUqj!LZZ z*yfLO6I=~7zx=Xi@Gr@a%ap9X#z9H~Ov|;hP%Ig&5Udcu=?EE(w40=M#>B1P zUEzPEUcPK0J$sjgW1v?DMkNYna4at-kSY53Yn-%lh!fHL5*+$&lYHPyG6u4TEPKrc z9A9OVAa;To7bwvEa>u`6Vfp1r6y1U?%&S9L$_Rf*ycj4F*fdIOrWEb^>p%H zYiWF*9asxl zH#8uSj&Hsh!x+-y_}VXe;SzT(NAwq;$B481Smb*AC78Yq*oB2uJ^E7wCtA8+w^7>l 
zSjBa~f0_*Cj1YXj-p$ zY*yf|Ve)8ZbazGYU{1#RBV{X&q!=f~lwbC0H^%bhX6LS*i$4IgLdv~ZOp#7mQNFcb zS0HkAtrs~N8Gs-HnFNGFADG`BAJ?N>Pk#$qysr3;(0Mb;kh&RPcdrKjItBdA3(7pa zYI;;e+nDDubhIY(z@Q75gp`YINB}thVaf8U9?Wpqn@H?oin6!W+E`^4ZaI?OcU^eeRjAqnmpmH$#{&f7YKQIZBxa88plCV#TRZX8Jx0&c6ukyja^x z!r=cMKXJSK>A&CX8)W!~3Eu|a&=paOls@7w?eZt$Rdk!e?K5f0*WXankv`sp`j)z0~F3W(2aH%yWu zBja+H3brmc)1Hgo7YV!t7$%tP$zw!894s{BTv#fJ#3AAv!pV|^V?Ns+yO#7z1sKxX z@Na#0t-t3U>7(P9V(Ai!Okh^}p0W>iLCDF^Dp=fMwLkZ%6?)m%J{X>ZQ}3G||H)!~ zwbQz#$_Z+4x(kJ`bCQ-HAippdxNK4h=19HAUc=BwUul?Bkpsx5CgS$G^QhZeJX#PfEseM?1WRm~fZRR!s(d3-U9ztOFNpu6hx|ix{;t9+;v(EO1EcFj& z@j1FI@`Kbl@jn!PFzV|<`F~_Y+|XLq0Q>*CW>5gb9kS9Fib8--7UA=A+Fi?K9;{^$ z6PUtW*Y>xrri&*G$34GqrDP)d zSL~PJJJZAX!VB>C`F!HawLc`pnR5G0%-$6gfw8EG#|DI^hN8WE*T!SK7#T`OT^my?`nHtFbRUafem-tkhaLx5x{&2A97P? z9-(rnOzz==fq^K>6A2iW7Nz#qL4dx~$CaEK;`;obvj1iy_ZVwPiEb}P`w|%E# z5||b(rkKMG*$v%;fiVO_y+rmZ~dkT{o6Ge;& zE6r{p(iq>-&}8F6#m6E6OBRPWiO1&h6Pc8p%@|x1JGao#y`(Pgy9o#mYZB|l6#Q%- zC3%p)1yhr*WF9K~5egpk;DIv0Zj)p()8H!$>(9i4Modi9AJQRb5ERv2-b4BGKIaE zG+&#GMDZLz2@w=`xbE4ZI&7^zOAa}ZM{tcV4le99dL%S(jigBYXKXbSLlMq1#uN%h zG4nF@z4=*Z4tsSTMoJ8ZMaA>tmr#<`@_l6w0f7xndar*p5aIxT8$l;R!@~Q#@q~@o`+XCvYKny$byn}1GNdQfE)UapM7Yr1BZDriE*$TFeA$8~y3+g2RoLbso+fGA#AGN+Q4|uso-6eZ}1Wd7N z*77x${DKywv+98rtrA~(~lUXJI7MOPqeeVI1NOT^!8IXCU-_nsVAfOu{2vn z|DxPhlZ(vQE)?Ydvf^)=Z36Rr-m=$)XyPWXG+UO2jB@?PQj7fx&9Lx`0&dnCf4QW8 zhWxkvkHiTMgdGX<4IAv!F5hoy&sZYm+;>AFIsmtn{|^6FqYeGo+2Zh!XS9~3sjkvL zYqVbG{+bhPPB`cTt?2rl{@LJ`TH4X6msCdG#C-oMFE=gmvmi}^HOyqBvBd*orawLX zvnoxN+1MY0uo)%{G?6w1tkw*WqkmPNw>?V{m-oo0{K|wU9bBID z2=t5W3nEcBY6Ak<04Hs;Ko}yQ=CO@7i(O{HZ0jKaoZ&HxmP_Mz1V9Z@SMZ0$WqkjP z2;eDbQ`T5`v(%jfz(`UUq2yyL1v)J!H!N}wB|UYojN5J_dS%mCrueT7L+yZ0L8S-)-~qmz6eB&jE!0IO8FYI1eybFdTY*sJU9HzOQ!<3i=m)6 zltb>G0dQk(Q5aDBM^U!Fs`9kvNv1gzFi z(V`XNZv-NYrd4W#DoRYv`E3WF_DjVfu(EQO;~?V%sVW5XH-=(BM)csUsc6=QNe6VQ z?Ri6dlYM8Q2!JzSi_#QR7(Y5=?xV(k1#PR-VNT$a7{e)cNp{8YDJa7RKm%r5CMlmA z{g(GE1i&W4%2XQ%+OogTa{VU#R=$emE|gpnk#o-7jgXlFnUeWU_A%94h!eTMJpUOe 
zQ!{d3Da2w~1Y*hTmNd1&4A>G+9vQ1gl6MaTmxWA(jAM+ZeDK*m>HgV@BHG|7OMw`k zT9P>l#EC9s^Fp3ho#!cs!sH>5cbWnY#2Lz{2u$H739>2i-b{Woc0Ogo|C|P7iOI4O zKQmJIfe>6s_-9tFS>u0`hjz{V_WC=e1{sU~ny3C5PK!fp5QZqwOzdGpX2CX{0c$fT zo*7F)n&z`IR4DK78vke6c4NBUtgnT_wwAWsP8}!6rWyWsxP=h;%Vbz%#30bfDX2!Lbz$QT@h%(BBDg4?C{b5M$5~xwTz3y)(X8SQIuQw7`AYpG|83HN&u_&!j6rg6ZLq5_08V#h=rj2}ck<$_krJz~p!=O=~;hC*i zK%Yn)VFDETlV&0yZE86fM14Ibw09BlTuYfSdLV zdwK0bGDJ-rz4$%R{IacAF>s- z$Ujeht?$~jtpx-%?O5R_!JsW?-eRBN%(tWweaN=8$%Q_c&FLl(V-JjlG&Z)WfHJMn zr@k_VfUyuswk?Sw*%>n$w%k;DNkHBtx&Tddr5tAu;d|(pA|tujyi`C5cT;oC_S@=y zXQIg*3*wm4VBtr|I-IdS_sMCJ{z?E^{`<{+%f*Av#S%BaGiVrcKkxbYY9|38?Eo{pB8@}p-H;}Z{*)1SSc*Z1_?63KrISH|J zoH^m8$k}2VV*NrfBMQV|68$4{NFu*(5z3C5KcU5>1c+j8o;_?KM(SZ^kF&<|A`MC<2;05s|P}ATM^jXGbQ0eDp7kZ z&TK({FqQo)<+sbuwM1-yMP23+MzWEK=}#+kg8QHOK1?BBT1a88av<80yi$czAg(eu znq7!gtJ)K(pK8%A8e?59qg6v72qkRIWQ!wz4|9Qmc4$@LN*QyTqB4nX@;?gxjw<$C zT}HL~frRj(zmUjw%YG^Jp8`$sfFC{-g0TuUq5Lt*wQgWs$2~DrY80CU`&d^%^Oiusd@1-R1L`jhaygD z*t9Gh+V72gTMc1n;mc@T)1R8&SB8fU=H_)A}0kB{5Hs2iA)XzcNdzAi~Pg=nD zuI(9BS2ZrE_6qd<=Hq7Uq%FiSMnsn#FVB_jp-`)0V7fv3!X`k_I0s*Vb>+7I7XFm< z4|y`$8m3ad(`r!aBn9%n)y4z@^3118BluK$i_5PG)uzyoH&7XV>;;ObYGYF0_k^zl z$DN75cw;3V(-RtUM&bt5OhWY%%`RjC8DSaZz_-ue2bsSDnRWR>PDETk2kOfE&{M+SSE9z*9fKF zl_ll07wg8?5HOdPs?yLKvI;RS>i%c42K8o3Raf|1S1Tf_ALKrfWH?KQD&uS$`&NxM z&63$ZDQt_JE&CZQxNel0M2n@}zYcAbKq1V-q@0NYJCyAmnMz81t`-742+bN6nYGR> zA{+TRE2Y-{n#!BqD=RzWD%W?GUttnviOk&hIt$UWL*ajs%l_p86$66W>P*BgRr>oB zp_EApGm*8tsEN$Tq{c8ik{c;icp#cE;KNR8(?Wi`@a+r6ewd5!l`k&@xq-lbB*dFi zvcMtKBT_?xuke3cHM9{D_~t3k$K)pm`amW%aV=n&UMf|Sf?VAaz9+7pvqwh+Q0YXB zIC0HVIH`oAHHvNBr5~0HbWL66zCJX=V08v_;yiT~vyC$pf60i_thndLjHnQXll<}; z(+c9uEjcx5O2tqXz5+1}HTe7$aOrXsshs&zU0oxj5UV}pJiT(9&QRu?HGPSzX`e!5 z(6KO0IoxhQ4<2K1%SCnD;!6dZa?n{{Y+dZz%|iU6T)w3=U?cgy%+F%fym2FE$Sq52 z3aF^zTe6jI{X@CmXJ{|K;B20_o&)uO5QxI{(vfK6 zTs)3w%o)(XRt7lZ`v;PAp_b(FBbX6K%9Sd|@YLsY5K9?Y4quCjw*D(;05l>scTr=C+?VK);(a8SRb`}K z-Tct!Xe6pBHT`)-2f*+fijoQ>kDxsrBOo(}Og)nx|Eg&(vnm-yzI1?0nbvZdIgq<9 
zAT5HJ$8rwz%z|C;LT7c>LQ%H{)kyZR?*yPv1)BSOH}s_5ovS4YSL#6zxy4O+xoxR^ z6o%#^RyPI=CZ*9di=qHHcQ!EPUk1LGu-%u6dilqVaws23<-)Ill#gngX2ho)h_U?ER#MW)S2Kl|c87gN>#2>Rwp%$@8wg*`0F?Cic;Qt~I&sGx!bq&w6_;G${|va| z2pB7chGKNp^|&GgM&=ES#LC$_6V4wu>ye4hEb-sxfXM-GJW%A;^Hc)4UJXAsu*Pht zE!*&u3VwRPXeju}{5Pu_G%8Yg-vIPidlbIF_dwO?<+At-;MC|heb8emGm#fsOl7r! zJ|{X|03$PH>6o4|H3$*jR4&*W7#L5=o1)6*Zo_XbDQwL=rRo3;PdqvoPt)@G5|c#m^7Gggr@Fnd_By}YF7z_wmjxQ(31bB zoGsG!^Vty7W5i!^pyRm{00-bpMi8MdhGs59fB(;%02CZdaw<`**>H6NO|j;5ng)uW z6W+M!UTzMlD&vUO_k^>QMya@S<_6>8wl|^gF6F~?N`bF3x3&1yiv=Aw#KZ-fC4-){GW2T;k3_x>mk;R+CT={ zmi#~Ac%BX)wn2Xl*;K>Nsj+axv3TcPr`l>r;oiK3GlF$s{5zuuZK*n|VI+dWjKX>} zj(_*}>9LB3#Z;mHyj$;ZM1|<&Kk|{UQj`<9r-GLfKl{Yv4EH}{f+QF#i>tn=_{AoL5%uReaS$;RXusxvPzG&bB=OE=2*G*J)k82^FBdzi011l9y}334Rorb3FVpulR&n z;4=wiweutpl^AjZPRhI4P!yaDcE~u!@?=&Ou=Yknt7ibNcwONcU7?ZGG~rvKKJT_W zvi0|6KxBU`+B47CG8d=7p79?v0Ow2u)Ec=`@htM|=r@PJLTL4)G>2KpMC`sQfFs^o zTB|8$bkTCzzXRic?vMAia5#`Y>XEp3rob;O0=)F;*Q5YMli0pq@b@eH9do#vO-;%D zj+9gc^Mb<^&mE60IN`h}$z0orCXY#K_;aZE?Qyu`DGhZj2|GKE<+fwQ_aCYJ9`m}; zODPErBsUMLp&=*n?Yk>(3z-ae>7Sdo(n}(NzkAuC?thPX82YI?^UMoj>WB`e6o)E&PZ!=l!xfz@ClsJ|lh(H8byL|4jg{WE5;g|B*}>SBZ!GcS;sI`VSTV zC*=R7Br>!QAs6lC_HR$|;~;S=L5QI+<|%`k18`#acR$(&4ye=3SvLh55Dqqu*yH~Z z@R_?hrft2LwMYGP;NOxGSQn5c@Z4^1*KQUy9@;Qz z%OR%X*7bQa>N{Xd7qA7q;YrFSXSU{I!NAyIAJ_!8^j5web=nrNOt_|uC_{n75I6)r z0Plf415nNp;aj{&s3pFCAsFm)p80!PyHWtQfe*j~&Ik;|6*dDh;^nwb`<@x|Y*m}U zJW%jZPAofNy@40N9??25<#YmU0iQWgbIQ-ffXd*EZ+Z`WHvV4%yTC_agOeuA?wRQ} z_hAqCSfjrut6BrPOjqhyh7pr@yTAwFgJ`P~Kc9h3&N!T=&_G&i0vga=U=!F;1F!|W z?-|3UirKYtjawmPT~@&9bQq@Yyt;*A&1i6l|xOr$WuzBZ4y71 z#(#(39bi#*z7R`dbrYwmKkG330WTNWVnF$G^bZX`$G|7xqufOvN*2fR zHv?MR59+`<2WXRHJ}ua=(m^4$_dUw5N5A9$4zR)>=IpFj;qB4;`V1T?{YSj7`xZ~d zY8gyD=yP6fTsd%Oz$Wk)VYeUxrwtt@|DRQRn*nn8-yr5R{~Y+%^8@$4j{gsw(V0l{ zDi?e1yq_&TU+K?n;5~4c_Xr1ax-AsbdB)dF#*fqAU8&&(i12@~H^^nLKLNi2KLa0B z05Xo0n&(`QhFGl;-9Lx^SHMreTa&?@<=ZxI&T7d)o(9ucPEY;?`~>_4d{!n0K$q8+ zCE|ex;)FRz{{!#~-|x}iMXK+B 
z{{X)NpS4#z=X8xJo(gS9&MV_8PJs`=E8tb_R>_Q4zAVa213ik7GIIU|{0Y1U-WUd4 zdsSCnZe(kFWeOYuZ-8HbKgX|?bqC_-coPErZmWzqbM*fLyi)A;9|dzz-!>war`HQ zz`R^0Ig)gqGoahR@4!#MpC-Ug*1MG6mh7A+vK)Ti13v@50$Vlww8(!OD&%C?dXt^@ z`vClh@>@A@sSLT9m648t^ahS7zrO*$*6c5CTDw^|eU$#op*2&`!Jt{sa7F2H5!T^j}N7p9YV= z^cg7p1^fd10qoW2Z*wBiH0wSokRrnWgA_X3{|E3B@EX{!36SXkQzPc}4jDY}0)GKN z1FwN?3qe3Zv!N*8GVS{lwfDJdAIIpnT&0o8KMm;qZj*bjYgZbkv^BQ-mO9Jm zd$~a657NJ*0+?~aN{8xZ#El8$BA!j)SKuezJ$rvBGa{;E?vfb30Jdm-{;XYzX*zI< zBxon*h_#;`;4SbP_#N2M8>KQoVG{cuY%bRS6}8`GjsJm;I1{gXB$4|k;CJ8`;CEoJ z#=nB)lBNu0sim;?fIoqsfLFk0#gFSd?#PtA#1|mveS9SU-_-Dv%8}rvq}x+*|4)G} z;CJ9x;LjRuMVas8(ElHzPcyV-bxxJekUzWx z&X_lT1N;R1I`LBWfdt%IH0U#Ly-n-&YN7&6{nYz_$#knQd?I>(1GZ}Jb3u13_Ho-| zyW_c7uU8W<1s_Pc08+v2OX}|(@EUjxysrV+PL<^>Us}_q6^@IvzG%;%sKDzABxAZ|JK7ANW+ME zFK1l;FQk70|3euRX3zJye|sdsg+Fx%CKYvSs>hypnY-VKpHCC%*M5cCnK73+_!IaU z_!;X=53EzTFRE{k zNwU|#Z{y*E@oF%3mdHWhC7s^_zXPvJi8O{CvE5^I>a z;O%X@MDHtF&-(idR)BJ8Byla;M_m7(z#qUn;AouxL$)SoezWm8^X88J-+`^`_&@Rw z&y1wxx(&SL^Y16ppGyTlm6u!C%zx9t1y%PIn;v(755Ql*Yv8Ym^joUPDlR=X2<+l1~xuz`7IsQBS^%tV`44y=56G;#eS-~jlGx%vfG(+a+P3-|zhti?V`!U^z>o%MUX z@}i&`aUQ^X;ADLF0R+GWT|;L#)_6rmpB~6NYX5ZndB^`k0;gpbV=lP(_q@7I_F_!0N+aKv%H~lkr+7VL4$qZXd-&-(*#y|KkYQng`Kfcd;hxKSNa!$ z6}e-nVE=@5?|qs7OQgRm0GQ(iS#!jn^^dt$<&pR6@Ca(~ zcC1xkk;d?pz5iwJ{mt|*^1`-h7F1m*=|YJw<7b)|4leSJKdz%!TIuAk$5S##c6R1w z`WJX%*_<-~&t+OwAIPfK!a}H)p9FGchUJ5YJZ3R0f2Br+AgRG>N>lQQcGf)y0t7`3 zcS=I!e?=hSIQ#3bd4EupX~dRXITBC-SHpM4aj!pV$`uz)o?STij=PW3FJBSM^{*{j z&r4>+vI#Hu=@0n4^}}Vo=Od;Tj0Iojzp>D&*@~aR z*YLl?cA^IPZ$~}9jGvetNo|H7mX7OH`+p;Tnj8|}79G;_@G~xjL52UOxW6f1ZUe)< zfS-`}8yWsHrek|R`bGSNB)%&Kut)riZdkwjVZuAFrWj3Jk^a6%|M(UVKP~@~4Lkqz z4gBx&mYal$tQ+w!UuweFruWq2+OwLw#R7)U&epJ zj<6}tP`LOqetiAes+s}=bS1rb9d^gLSngyfDcSU!o{#;>J$gFnYu|4@VCmm@zSe`* z|JL>E0mz|v-QV_pJ3=%38~oQZfL3AnJM@p_fYdiXZ*|D8+J8)ffaRsW;r-Ugl`)_C zclO`T52fFVU&v{I>~HL!Zl0ExzK#Bf@BilgSN#2*=WYK}zr6K@kG@|^`QOa{KrTv2 zW%Adz(Qo`8js3zGo`*E@RM(-8WrazMP{~ZNy0{XT4r}W!7k-uYLVo`AE-&1hbE`q;-AJr~@M?u(86|ms>%??T^ 
zKE~fPFoE<6eH;G+fnfGG^auRszj^=FK^py^aS)5Y_dNVg_iG*e!25p*Q9C^GlW~^j zurg9NhNzKTE8dU+OTAJJeMN}cln^&!h(%ORdM%uX}K(w8V} z^z|v4uHCBalkf%l8;^V;+!v~Xj**ULfk%69M_}>&simBi;?PH{x?{w4sQd0$SEDgF`)&;{e`FQ*6+9-5hAg|*M&LSKuYHt}aEj*G8P zkwqeZ>Vw^wB9Je&{775m_wuV!l#-&EksQ4ZSL%Na{Q-}9*s+U%y-K|bKR3R=hM$2n z4Q4kzPy93_MH+F!$wfJc=Ii9QCAatvh527j5jXg-HQZm3dHuhUBF+W>!>>${HJIgW zN|x(tBL3?UKoeNvdA}J}XwO;aIR>u2SY()Ek!pbz4p(71V3Fc$i#&6bfs{q!-7gk7 zn=CRdvqCgvk=6#%eS!XY*0~mW?D~>LUiTI04bQtG?J}^!E|-vX)+5en>)r5tlNHi= z=IE{2^O*B|zEGrGV1;RcTj(lmJ1kOvf&U#~8CV16d1NqSqsv~sNI3S1fhAxASY?eM zWrh2I6BWPA{}mQlU6KES75;;Ii^!NHIQ(q@8$y4NMal!6urcwINRfLUXv>6*&%o~2 z6xrr|{DWsqcR8Zv3;Q?6_0Lj(!Pc8E7YSRyDzE`8ap1#{;M>;$bvNT@9asaFfG(T9 zc3EUS{>mniIi{H%irq2QtuM+?Lit_c0dR{O5VAaE-OAg*>2-mKEAih29s&2+rj)YB)D5$7dvGle=Yc!G zL*OpETv}{%b8TkL+LR6bw1ExaK5&l;pv6-r?x4*_V5@dSWE_2W_{Z&GAk*C50m*fZ zPL3rrz-{0GpYO6p(F68?_dpZ)bOZgjfCs=mcBw?HkUJKBWSik+{wKgPa2L1>tP?}A z3=?_B=Zov;p8{?H_kah$l4PMS*=hOV1{>A@R)L4WW8e-j0|e|bbUlunoKiBGe%IpQ z-hUfdVw>+d+YC2>PhOkf*q1B<4}hnn--`76B;h?fT_=28f6_hRK5&N$5F7^e7WfPJ z1RUMK|3%;)@QCNY_BQ}MsSp;>{9`09}UYkkOBV1x3z0?Yt0@w3G?%l+%{ z@AT(=;6AWU12B+-s_%ghH^{H!zuW%}>c5<-`+$Rfc5dMRJa8Mh3*4suiDkgv2jZvF zUzGt!fK}i*@B+9CEYpMN)BXG;U1Jj%;2Nav0MCKvz(Z*hJflNB$Hu+M&30439pEMK z9JmK8$f?Uyb`HmSU6OucU>SGQhK zga$<@ueV4lZDCgW!YxC?v_{6PFQ7=rHuyTBq5`FUK1VqgJyNbugG0*Tl*K2HTS zxQ_p`zysp1fFs1sLc&o zlD1_#K3tdnd%#QJ2ZkRVy6-d0=YenjJkG!=;4bij+G|k;Z{^fJ4auj~ROyVhUlF&ljv`>`&zXb8g-uJMEhE%sccxE`j$G`G1S+ ze?oN^Q)$gGomsGDu&x6f{vQG_fM>ueXNM)+LdX1kBK>AP9ey6ON#82i4$c-(ei%34 zZEBAnffvArnEM$S<5VyiU#CB$`|3n!iPo<{z1J0grdN}nfZF>9;5oxDO#z+p(3;c! z@5kxy2>u_6_Xz5r4ySltPQ*_PtkHlt71(7RHl;YV=u*`2@91~l*Z06}Dle^JEK1Aj z`|ITQ2_2{vX%ij*Yrq*W!;JRTMEp4U{SJ7-2&hR1w8w0~CF>E#w(JVMPbqzdUKf83 z8GL56W|y@h?)Q9j}fk(k2Z7XP`XgI)6BI~vd_ z0=r8;%4+J?_FiGI1>6Fjk$-Ogz==&M3;G$8k>}%y2>1S{z!RpxI>cYfL6U7C1~a0^jHo>Ywgfz+`@Sxh^~Hoh zn+oWBq78S+Be(wBtN@x0Ysv)tK5N>S* z$iq*<)))=zeN46Rn0*{%z2>c5hoau2HMwg7+*F=hozwa}7G*5p$BmXR! 
ze$HyrvC=;a+y)+U-hYR_zhS7$VJ26sMxSaXvkR;<`+JMVK9B~s2C%@ZeGYiTQ>6gi z0quvtQ{W-6Ow(Z<9TB&~!mk5u2ypy=#`n)tV}(>hj+S+f;MfDs)QM{W%Pc+IqWkCk z<$^7l2?M%)Uf6P~^gH+GDR768W+9V+JH(=mFWcu0VF2;(=-;5yZOHu87O=>MhR+<9 z@8RFMkB@*`>>V(81^%H&tESiKe+WDP)|kAEiM)a+UZnQf)B6UD|4)Dil;6O=_Q+h> z6F%%!;-*+Vctrek{FcTpsVL;ezRHGM^}|8f@NWZnn(PR?;F0+w)8XbmJMgXW){rJ& z7_z*u&Y26lU*;_!gc^ger9JpMnS_Fd(b8wI5>=6?j7MYE>{U>3M5 z{BIF_kzD)S;y9aaV5f#3qyG_o_qH_PN3?_iUB3gKT`oTsK0jvoYNs?K@dOuv+w{R) zRTmL}Ch!3GC-5V1U!q+QLxmjl3Ta1!;fPVl_7<|S4&{b>3 z%UKdQPywC+9s&QL`_Yxyr;tm-tS$eNx76is5AFfqGxnKf_hQEJdB(+nLBL6k|Es`D z;9m@1n$mvV;~>|DpV2$fAZQMF0{jE`4!A=Hz@QV#7KUQR%Arw-@ZVXFHh<5a$vJ$| zI--u%fXxg4);K4`UT)4vNQdx**T$Bg1>gnnzX0De`BcdGIy-R)sc9eB(4GeA{|E4Z z(ih1=BhZykSdZG%8|NH;UI70Fo&zg#Nlhr4DDp>N9V-2Ez*EYPWsmKwBIE{A7fqh( z*ntLcANU9GBV(^b=JlHnk9`Apr1Wn9FM)pmPk}WSYXiTnK42Corw3appc&vHs|UB) z{07mU4GCh0Y%3|NxlP~^_0Ju8KM4&$z`e}q#$C~XRP^5>{f{K}F2;stht!0psEk41vgdN}v@4ioKa@&k%qmmd5>{$}}U zO6%dQ%u<}ezoX_WxFL-%!7NH77k!v)J+|0=VZm;%oHB3#vEmKi%F^ynKSHt zos!dUBN<(6-h`QswqxL*n;@-(oVnz-dgC0Iag_OQp{gxY>|I3{XYCYusO_P6=f2e=4^%J5}p8`?d;j zgKA}&{13$UiNt-62!SPO(!>;}>#6q7T?mH0vAtSM3%QU^hT#mZv-_{H(Cg%MNY?V}s{iF2VUAOEvnPNUqX)<_!6BF`y0c zVK!LgE9qa5vlneSKn=NkC}1whZT5=Euj!vz`BlY#%22q;QxdCmv`YoLOp%C1TEPy4 z5m{*Tuc`h?fEj603?=G`L>xnY#k@@OEAc9jz zOPm*o7@I@l_HdS6p@?@TUE}`#!+ruHR#am=k zk88^D<}w9iOoynUkbAhmFtHS%dFu3OCXh2>K|}13?cuC?aLYV+(eOhKNFvDScPB;M z`v;~#m*sgini#{tBFABNT`yIOVzt0VL*+Jn8yL|BmtEEY3gdrQ_TLmhCd@M@aGCU9 zngLs7>A;v5iRv0L2x*7`?wA3$CZC4ycv%RpX4I|M32Yc3pXT><8GEY4#j*zYWZBylz8$Un?V6I?maHHs3c@!@m_eaxOdLc9>7u z@wL?zi05VYO$K6p3qg^YGzY;-0kwH}Y=ui0@UcQfI1mSJi3T7DQOr;u82$FzOzNO! zm=}4>vgOKH9ggSh`tE;DAEXd}u0dr}(ck2Gg|=+9dM|-|X~g}WGyHWZ&n>YYq1fP9 zx-l%>FmLiZMW14pUs-r^Os9DoL3C}%uB;JmR0*EKnAR)`#tRQyZHqWLn$n96) zbC5fr8gjdB)1{F+M1M;_YhD)!CKIU+@Ah`Js-htRnTX?B_^!MKf(<$A9w$hrC=HuKR@;rnzA>d+35G%%Q! 
zt5P+53gjywIcVocPH`H0HJdRR)BgHJrc7EKX+EX&&x!Rgo)zpLOXMcFmKcDzfpnTT zWwvCi)N8d*NMaYLK(0)F=h&*H@lqkX2;U8vQ3E_r2cpAbGbHd-m!u){kf(tp!LVzK z@pWw>S}3CR2QWv6we1W*n|SDqUG-}~R+qLd5r1U>=A?0BC_bdB5X*+x`GzP&)}cO0 zgkiNL3b-9Ddhn(EhQj~0P!3XI1;|i*^ESIuOP03%#T)Yi$%0x{vr82+12p2;pKJ@c zH^y96_;uKrjg_%ssw&3oD+>9pi1U70K;jI*Gz)Z~ z8ZxHkel91FYvB_2du=N8hRBT)oXLGiAs64~?-xzB3mSl3*cyvUu#{*<9qL$~B*@w_ z+!#{6ibRcTBBCq9d@KI9!~n*8GGXhR?Oez)A=4x|ZOV=DH4n`8(ze6VZzLzq0CXk! zU^$maKnv;Un9>MJ_oFESQ1G>VX-6`qG^iCrAZxN|wo;2y zO5=p#qRmM^O~b$8DfU-u$hUyj)Og$Wt3`Uc6bUt&+Ur_K1n+Ea9Ans$p1;gDmnQIm zyx$al3;e#eT(}&I<^Uyc$n}R}z}+vWiO7Q`uLfL2?C+)RNJ({!tVQ=b@%5CctxVkL z7Txtyeq(~kAZUxvglC1A)PVZ04A)H=#8= z9>kDhiLovay3L?#GT1obUO8-VbXrryCGZVEC>Ih2EXcHhIHvZs%u`A#0^fR>0Z632 zS<*i(I#|8N!sllwEnQZtEn=zHEG7F2$ny;ofTs9dP=#T>Z4n2hDqF1OhFk7#y zHh-xmE(AQzX2+34H$%yr*R;QWQN(F&4G74aRD+s?$VCShT)cVAl>ul|2e-J});X7t z?Iol~EKV?J(}B_aUBL}c{YJh>z*oty)(S()GDIEZ!W3hf<5U*Y8khml8sJD05{3@Z zpMiLXk*~&bQAuXj1Njre2VG79&cf)JZp3t58PCMATKZj8U@nolk;xQf6q|XrQQdRpG@)PsttSo38^;#6Q*Ao8XYe`XiFak!Q=#Pkorooszknd8h zUV_RV4#sq4&4DcB>&!4zd5MHN+29eA1sUO#2uCenQe`xnbeB@U0lt<3vP4kg^iPY* z$l4wu3fLs7)|?R4@NX-R=Q7nN1QNT=8Hn%TOf$feOsf&G3yczA&Lrwdn)mpg>acLrcQsz;zlfs zEPNXP8h+Dl;kclr637)YgB-FwDgzZ2spwkx2`|(Brduky#$7)9ZOeEU!eESxq#{G3 zRfE21@{#|1RVY*In~E~mbWp~1)vO<V++RTm-FV0BW1!DO&rh`r+nvo8E|P{I}G5 zCKXJJ2}azVSbjg$|1ukz%fpe-rK!!j2&o#2W!VjpY6D0^-Wu38q`r6tK1eL1q!tDy ze3tnda)-l71SUSxs*Tr#1B7_jBNEkqTb7dnMIg)zgWM%kE!QtLpU8Bzd-F)P zHuE(_eG6+96VrlfUIS6XArM$fIu=W&TWw0wiB>8-J{2R?*U^fhB@~n?VQqudECd36 zrTN-VoXc>voFW$Z8Hteeq$2dCNyx;^WRiTzBz2|nYtGa7JsU5ct11y;sI@`WEJP^) z8Sk4>jzPXrvucG;kwT$k3n-*Yg{w|OpOI2*;`BdsEV(t1XGMzM!iw%{L(!$?FX^Y-aJ3^$YS3K z)yP?cpiC7&CVrJts4`HCGAv@s)vRW+h5FTlNo^6iGH>LQ9s0V$jES{I<*JM{L!%D5 zW1YU)&jC$_(jsHWD^1xrS2E{qAP<1DiE2?5&#cSZF&RHPL zce3SyNfUNvLo}}#60bwU&ww&%nnC@j5r3Wtek|fEIdRU}P;HSr-3W*d%Qz5qVEpgX zN3TTl|`tK+A~ADQ)x1;wFg zCCju8fZT+q@Xt?0KF`z=c#t<@hH`V9eMP`hzR{nO`4#_1{JZ7J%wro#3alo7?)fwo z@F?0NQX9*OM5`}fnaG&@4=KNeP@&n3jP$29`K^%<`l`|zW6ji#gauhVlKE~U`SnON 
zc|y6d1Wy$&76jmEEWfN)H}jpiq1-`UQ8Q$-Ec_aPIZly?fqNt!akx{D3Ft)p6-e-0 zh$S0V7uZt(uIK@29AQciq7{am##bIiA8|gG`%%mH_u-pgGv6f!MpU2%Aj6RPUXjoO zxuB~8(3ParYJU`r=0Wx4k!n7dO^r}nVaY$p3x;+<*pRnf4a8S~sJ@!-k;*)>fiXip zi&~H#`o`Qg|H|$u?92D5>k$c%10RmI<>^Q+yqcXP*S zMD<<(r|$jWfY(d00T5?-#7J3H(g|?lP%{$Oqes}=!`5}s;3`aER`}@=xh8Ff|IoK& zg&6Ql!_NS? zdHCR#fcq@{E8MGnRSAFdTK}nqsH<328YN{{Ru4`sZV zPtgp5^Qh<)H@kiDBCr|#NbxKkFMaLRa`WNLLsoy)Eq+ml7mJI&*3pJuam*)C?!-Y4 zNopg$&{b3wx+$?%kn#x8oyhs=GkEB!esnd8ECAE{Kt|wBD|uYUjui>2#H+6Dgy=$o z#>>mu2lj*C-kh=V<71!hs;v9fgp&uJJ5g_l8{Cq;uLBxbz%$`kl zQ}vH2INuap>NH*E`dXZvR;lsDbI3^i&y^QLeaHWqjmmQq`3z2G9EV~W?Cz)A2_4h6 zv98SSG>f!KmPL{FNZk-N$d!x))Tbu7{IKr(@ai3SM~4h%wvPOrywIkV32#^|9(Y&? zvOS5@g+U%A!wpzBle=y1VMFPdHG>?Q-~W@#-AZH8|9Yy8rl_L%(QBzrulgxL?U$p} zJn1vxIum{Wozu8(_!&~<)bU~UOGVzO=$6&*qL@~p{Pf4h>kMSo!>|nUyi@3~N}p`O zPiahEHUzvjch^TudB5i6tL{|4PD|aaB%p2cH|$ENn>CxA;chz)VbIu`DrxaJ9^dU7E66SAbCvj0 z0~YxRFHZLo)qD`TMCDOh-{=^{&)j%io%?zUXsL%M>a?;!MatVq;hN&oopC z6aC!s#_kHqvi((fDcC;D(%A8jfbzY-!nLCD<7wnD6Q7x7?VqTNU-*R7@-g)IZk`B` zlFCKxLANe(bTs)?OTDn_1xjUIr-Q5V%v1mtjsco9_gGO3vGq!_>a(4aAKQcrR46J8u^ zc)#r|vY>L2*5E*NLvo0Qu7hS3u|;IM`s(kvNqF;m%bIU4ME@f+r&}SKz|JkNCt|DV zA`OaUKlgA^HzJ^r>pyHLRZR>_w~=_gmsUbs#I=9}bF`RLIiO7UmHJ#5mI_)6#J6F~ z(QD>(anDUT2T6H}nsAw`Gl95lK?be@(==5l}gvlx==LJ>zyBCKAA@QUd*%sIAl#%-`kRl)_woQ3`LNw zNzpP3cstZn%5tYxr^{0GH%gDr?>GMpGOvL}uh1aPBJnvg9lA?mK+^w0j!0z*q~S&$ zaRq3e3Yy)gfBEc2B1(@@pUtY79LDgyQTI;>t_qva$lmq~t-O_2HAu~*+C2{#H^B^_ z1pGF=7*7KS zZhukGs`R#rhuS}Vn<+K~m%~>E`@%+10Uj?6MazhQt zl6PW|;23ZWzVcqx8-56@!emYG2UyOSnFGlc{uSGbNpAe%PBWFKx0rlf5l7K2-wE4F z@_eENJ|VZEzbal&<_h|NaE0!KXM}gF^NQbAx!cHi2(=OihEvo^Zyi@6_(&?p|$dsO3k-Ppx zd90r@*FEWeoXVl{yAK#dYPgkl-`mWoUpN+E^95=*7vyGD6O(rKQW;J-{O+|ufUPG` zeiWmfCn_9Rgn*@!-xl##)?4o2DA#pfd7Quk!M+Q37g*`y|{;lkPNP-vwtoYr!eyFs=7q0xs*lb}wykj^Hq%w1o zK8Ou<;jQ~TyNjgw3{|?x72goN@D4NP4_d_mkVy}&ytKnr_V0kHYRgMpGB9)iwz3uT zjoh<8Cu9(7EgOaBg;xW&hanH7N&4*mNIP^KI~ZB+EP%0iF?cZ$Wp}plfyrU{VKtKL z5q%`Y4qOAHXjG6gYtnDP6pgV55uyXI#puJopzJ^juj<4^g1YCE(JLks$O^jL3_TSE 
zGq%T~j-|?%@Mhq%MLHJdKW37@7D53RS|F|)W90vs7s#y?mWiP3u4s%`NjUrrxDdO- z532z&KOrcU+v?P&A?v=B!6Lsc^?%a8ZIy!bq`hZ86Rw`=fWl?~;(fkwdKQkM`bC)K zai?*0yjrGq{Dog{!A5^dIOI0*O6l9FO3L}JYsbH+K70g@@#;$)^b1l0pZVST%hd&{ z6RI$LfH%PMgUDb#1eP7oE8oSbzDT52`AOJEobHqKes}U^s zT0Yv^SW5XYrNspm<2XOEX(l<@@1VXjPr5WozIF5Ll6_ zm5SkSX(YD1TbP>VIWIMaV~E?>WK?6Bi)*J4F4g3)OCViOcS0ch_^8uY^4TH6G?*EJ z0DzYnfJ!M{CX}iEO+9W0iTR_v8w$DriE_qCnai&Ad^%oh0FwMM!*@g1Q9ByF&L zkz$CwzTVcL<6}l1mF&s6eA=B!xMHk}aDnELvwrVApqwp1T((F)h2Quj2lXCcgQ?m%OS%(Uf({kT>F3<2;;buCPHL-;4$DTraBR4H4*KSq_ z`50}}RzvQ}%?u|@DQo=e31iA-HyI^B+R+QkYeLv;_Y1x|W3_kogCpxo-OH@5I1QYJNK z9eGWzbn#8$MkNIY{jiN-KPp$dOj|NmSj3_Fa75^ZF|pfpz39J(xNiybHQLB$mD&ek zL3C-&GUv$SP-V;CW8lg2W1m`V;m?6T(MOX(hXz;YOeYtvH^ch|b46jAH!RrYsWY}F z+=Uh-!Z)k(SON26W1-i7AL3zisK8RYbeb|f({Kr-(BhznEn=7>5F1?O;j8p(->#W; zgnC1PflC|eDkgjfiAR8=r#d1Y3+X_!1GN&q( zlZ~X3I|_&2$C@n-?D+45|FG1Hc#*%|%wA!=#90Ashmp^lb8F^1avV9ddRh(+6vK(- z?mZ8yCf4}2#pDUxtx`c z;Am$vDFwM;`_Y#V z7kfkO<1(NjbFQM-!&2De(*Y{)r6reh0A&f>TjAgs`>Y%&ABFg}myS4o^27FL>wcUT z_2}(k<1HF~^ZmMa9wVI^ zvpr7%i;fV%vdsyCLAv|fN5eoAs<=2LwOR0A zd2-v)cmg7jpgg{9U8Gc#^?T%*)lEKXr$c6RVM#~PC8#5>+X&5;vfBFeh1}haMo7)_ z8w3sB_1pbqBDhl1jd6zw2ZrtYGe3fz3AplxTG?doX`uS6{6G1h!E#$d=y;j^QL|5g z9cf}Ja`H4;qfVVWS?n~oFXzP!s_DY+Iw3lo`{~BtyfVh zm&w!W)M7jrp*tqdcVF~_cW$F6pM60+$f>MNUBa|YAo_IW)}dnX6(Ys*uL()G%!NBV zTt>_{ZGx^>7=i1W_s#JVC4S;K-czwLN5KDotVn2w}eDI+7 zV+{2?@0q=2_OshiF7O4#J80Mye!K3goK8*RPl`rof6?)2>eESGb@8wIG0D9V6HREC$4$w{>Q+lnKyb)cYG-$K+EcWcmAz_qIeBq@C_|9iDFkJKy|D1onOu0 zo#Xn+nYLKZMfLdYHIV#oYkHDLp|-f4L4uKOXkQ4;j>EL?XB^YD3hzPc2R)3!l`rwt zccdCrgA{WD(BB!ZwyQYvW8^9Bcm+kOc~f;p*y(#EbHa8-u;qJ(lK-UxO~zWtH)$`Q z+n-`RrH`$5lHvEj?QU<^zmv`-G7BOK(r8FhPP5Hto>}~xbd3_{;aJAkkX*!Uy_)%H z(w~q>#n$d1vtWd7)!<`UCdm>VhIYQ-w!e~OL>LG5t?~5g*+qD@7huUF3#?<_t)>0W zewVBFGW8z)8b!9tkDo27FOGftdGc~L_}Xj3E~AF7LiA}>lQ9O!tC*8j{sLL1fb)^w z*7F{OFEMcMm*cwgy$GlI>0}Xkle2Ht2vB=d&J6>1&a?t94ts0^BLXf9~8o4MF&Z_V|_^Xk|XYDcAF{^Qgv z9VI-b`GIZLqjmI!;P11ne_W~Kq_yOg^#kPh`6(=%TfQ#KT+p#N&{=>pVU~q|UL6;a 
zz8$EJT&{{K7&V~*LGH_4eq($xGsXtg*or5Uj-N&<$@y#dAp|9SX^%z97> z3wgG_FCZ5gf{=7@ii$8aaQ<5bzLqM`vO5iyS^4V@bMm~)TH+Io+& z^#lmv+yF3y$^taM2vv9=69BoxrS$s|WK4w<N+4yo5k}@FA7gPZvIp+tb@tyo z{~NWNXISD++>=<`{x!1ua`f?)@I9z~hY5)VC=H3e4Ia3P!QA&wim&E~zA3Fn`H zj>z9GJ73dbMNU@b9@PXsLPayiJzHc)4t500mC&xE^4xA2v)3{{d3`1$fyn{U{Cfal zFb>(Te-=ja@_89;Py1&gFHkZ};X7R8-H7zq>HJVnK{Ct*z#g)^pI7S!{Bi2a?I3ix zCHQ|Qgp$vSqIZhG&rv+KLE~|~Q5ksf3Wp!XENBa$UtADTS!N-;lL!#NO*rIxM@$NS z2hPW(9xVvg6ag9($kB;)X*Wc{p={R(V-N9p0?V3p3GB(N7n-m2yhh$z1WUi=po4ik z5MMoR$}#$^a&HU10A&C3<=Z&8>|B39&1n$6z-`Aykqt$(Es5n-Vat_ ziSfp4&_|=X_9DnoSgWR57Qsq}_bdOu{7OVB-#S<(RC+msla2VQjsEQb2=>h}hytdt z7*%WVGG6L7cO&${4-&LXqyk;kdojAYEet?2WR(yJ)1qL)J8Fk>d8`bKZiV&VD4xvmn{~~Fjxwel znd@I9m5_7Q1uf7B#HvVROa3Qc)NfKhiYwMD)*pmJ^#jV>mszu6*Y1k>4imR5Cn@t^ z4o7D_>K){Z<|KigCjNgKjav2YZ}Xmk#MT;>4 zV2Bc+nPB`aqM))r(v z!6-I^_7|nXC1C-FhrFK=brktQt!IchmZ^J`21r$NoXwOe=eTMgpyjbB!h$h)UOF$F zwYFzF5l3?ZB?%*lB2)xzT{nTfcQ~Wh4o_YC-<(}SP8cj)uyrePIdUx*dybrUgB@He z_5OklxhDlRvU30PHs z7F^a_z$3)#xQK5m-!1%Q%uV?#IRF6>oj^_?wb}$fDQ9=PoxiE;{eBHIBk#lnIv)wsXdM^?k^MUk)7IR<8EOB1U0u4FxSjEkSP$=X@X2~AU)BMCurPd= zlXhgwhctn+!T1(mS-4J#h<_1ZflU#saES6kHwELdu>o>NWB$dbGnh>1~J0MfLI^TXjgtI01+W~29B(&#(l4l z26r#yFai!lJEnOfJ*OzVX-%m27y(^n5kRu;e=SuJx`;F;bj%@k)6|mI(oC|Tt6WD} zj%J}*%V77!w%{4~;{?BooLcNEOxZ)>Wq?XX`8AuUnTyn47{AnLTm#%Xh|T8A(MtI6 zQ8}qZMBvfle;MGxAQi-fr6`HA^L9wOgb0NvS^>0JbnBn1mCav0w*_;Fy3W~4kp?_1 z1O=(LWbV!$rfI}mj!7PPw18y>vLd(JO#JoM;`iA{83BQ>Mo7vV&dxw3~pU(Fk%g_3Jq=a2&BaKG=|8r*47F#PF z(zaOPV2hp>F;<+l@?CS(kLDk#73j7|Ofo}^GRIBU%Nw(HI-gYuA!8>vqH+iEaHKT_)XiMJP=E+|sb=Xt2={RC856x8s>=~EE_-3lY&m&mio zWHid@9?f?y=M8#Elky(Em2Il%#7s0p6GIHzqF){$ISltV7mwXrzRm}uf4scN!yv!% z`}N|sCE4F=8f#PSjNPeh{MQ1&3R~-|M-y$QPnB*KxxaMVe+_F7^WLN-u z(SECW{@lt9Jp)#m1AG6as%e}r*zBfEimm$;M>I*l-)+)$=leLnv;k`^1@T$Eb#h2t zc6!Uea!sD4BDd1E`zUqwdv7wVms-&^fXu)9i9;!b;xEYJtOsA@y*RpQWjtONn1n{M zyrSES?4v+~#Vk$vh?{^H-3fRdEhZ+}Q1{b9!y%R=MjYE|NIFYo%#$6Y)^FR#Fvn9@ z8}VSB8ZU?_nHPuxPPh^C8tgVwt!>U+{|e)uh}*x{K@+jk^n!{+MJOq8w)$%-=xuv% 
z2c*J|`;9IF>OBr)JBz5%zwrT6?6k`6qc+(3t^^etBZ}oau5cO48Xc}p7FPO@0_~-> z&cU;KL#ztEkJv3^Kw98?a?Q>U(`t#hzz*Bb(AQ1kMdZPebDzPpPErf&xUaIJ0u2TH zRZrPvM3OB&mVeT>*;_b;#;!UEvdo7ZeAdB45cuhzKq)Lr>scq__a;c=0)8(RO&rs% zaIM@Q{UIG2@Y)GcrcKdvdj2%ccdKJFPix=ivjk9bwdRZiZYRMM@sC8qU02Pzr$3SY z`0asBl^++z1QYCV)T)O!%bRU`?pTSYSJWvf}tH%R8{w19ABP(WW=9; z=C?8$gcoAzhC~fguOfAc_pFS-^cF(s0SW) z^|rdM9IlIQd4`U3Rh|I=O%(>rNMBp{mG>^(#xfMJWB?i+C>T{A^2zXLz}VfzV;>g5 zT_^}c0x&6*7VI_$g?W~9sz+@~PjAfO^rXu4-AB$7oz`R2AFi2}Y zF5iI?1B``IEyvvXDgOLHs5&I6V z8)YNt*~>n)ca)e&vVZFe@%XZ{{PVG3NOywWUDkVzR#jo9^d_OjDM)-d(eVHZDa-2)io3O`_0iCh7UqAjBs)e$3GQ~V5c>qO#e zug7Jaqf&FI76whLf�EU(^!Js=vyuJIim&i9j|f_r3f(mWv9B&ZWVe| z1;7;J4e4^W2L6EF1i4pol|MD~1cp{;75cPGS+KWHTGlA~zC~HqSOZxep)}0eaGv0m zox+w?t)4&8k73aoN!)%%G~gH0L`j|EWXJuDPoh_(4Kz!x5pLUkc^{%Ex^Lbv_bCwq z!n)}l1VkF9V3Ef*-Gqu8ck`C2*OkMbT zN>NZU7Xki{t=vbz$u$}8Xa>GVhx!C^h&4Ga=JE@f@uhw-2je~4^lgdzFl%(0{!Q*8 zPHGMpb+F9K2YKI7<&TlYqB2L4Q&##n<{!*(B8e9>d1unF3dI9si1*8e|JhyE>Oe^> z2#yA@GTeuMGfr_r4gi$On)@zh<|azV16dkf^?~yAKjgUm8FEm_(q-A^E>3l^kT({v zXPEZmcuX6+|1FF!r(9D0xMnl(nzVe7g$mxa8Q6pcXsf;Z@1oVP{y7FIwDMl_O5(|H zp$X_WUZlZ6UJh?F6EllX|Cbb@n{NlH9v{}-EZbzP)rHM>z7`uY;ml`SBn3)m+Okxo zeO)+l+a>>g3EiJ$XP_~t*Bme<15Ugj{Hlpl@mUhIG&`xUyz;lPb*dUn8q0xDpGS== z=Du<3*|m4s>hn7M@5SalP0}*kFT2#=3}52C|2$cB&QW=wiysGLu-X)C4KVSM__v6z z@gVziA%`E3JPjW$5Jw7vG$OED;?{F=?{BdkQM>wCtbG>N_dahNr z#}TN@&SVPD1`F*Tg95$se3D|B<-9G@&EHjhAN~FmM)aalt&uhX+(aqwXy=-QsUbZP zAD^V!VTi~zjBk8+=vmnhmh==5XBEr3CH4{$BY_BXXp^asbeouyPk$qCVE?Yc^pwAl zymqLsRj9ygKGjxur4YwL)6k zCuz_Ro%z|yjuCMdeWe%hUl3fXbT{o2*fO(+a2+W(E`>F}ayJ#qK4!aJXPip#df%8W zbz=^bnn0b1EY^J_M8C-U_G}gTr=ICt^4C^S1A~#|4|)mOrUBUAT)cb#+{-H_u?htv z-=fb*gVgWJ3Y6EWnlLf4KH^ilAgFMkxX#38?MrhxDn6}+vE5lMwpdilvJ;v|LQ_}Uvt99$51@V zIJ5!j9S34&Bi>sti%)9Hl2eV|!U6TN>(b(kpN15`M$R+Tv~b^l)JW&?W?4{Iela>P z9+ITQk3u#cmhX+c@dn3>3Y}(a4@x^2KTU37et;tk$R8k0whSlMC41|WodY#^5jB@RrnSDyF5|E}^hP&flOxdh=Wx+RYACZS_v` zG9Htp!#1oxi6_$#O0{!g{eepzQ*k1w)QDB&D(gLTs)ksMNyR)Pe-m9&qBIou>46M8 
zcb@jNKL$~xR`7AiO)L}H*PqI`)+^CLP+`GP`k~fl9xZz}so3PatZ{UUZAvB58XITA z|3Ct0QN4>s4Ap!+pPiyMQ*)bUZ!9FkHQ1+$@hTNpcu$?q*mT!apDbqNie(wblwDz~ zlbarUsj{OKfQ1+GWFuoU@QiqGs+UI}dy}|B5-xD<&SOh5yZArXa+TMr500z?3@inq z)K6kudmdpFuB}g)e9z9-XzB1%xtX3m_UVx9<+7@i{-=~}!Cles*Fe@SR>OF3)9%K5 zt{{#gLU39EW|dLtNC}XK(o3)(D_6NsU*DjadUerATXLy}m{gUvlqde_7E%=$$r#}+3jeNu>pSzFAPJbJ{ zG1Xo%S`F>TB{dA=x8up!Js@6;*v@Zd8T9T?RV?fy`&0a;A*KK`kFA)gs;jS)xF7#* z4JSxaQ?ydd6Jx;LQ>^z|2A`XXP~>3@2~jv6Dvn@^K2YkqapQeyisXNU$OgUG{Rj91 zb7ZbcoNRLA*%Azcn*}^rH=e-!Yk2ctzks!}Kdil_d&!K9B6-iKoNdHZor+4Ajh0X1 z5f*q)1t>Y_EHp{8l%&Ptgj)FM+GncTlTX>7fRi-YD??m%>R+mps1pT0w=8#01q8o@$ITVnLY0{Ef0xNkIyZb7=qn99NYQM1^jwaZT9$&EHpkciK0+ZWD9>Zg#!mFAYKzD%PS{2yeQxA zuFwE6_tgLVzQ7v>o4#0Rs-!$={;HwQcro!5GxUU;I%0k+E_Oqg{nmHYYMT2_D4Zi| zzho_=@8604>-2A*oGv#A@YDn#aX}!1*DR0Cn@hX}Q>ut7OsgnLgwjLwZ`}I29&_^4 zFBGNRe-cxgfNLj8=S7J>v2dkk_H6jw@`&8xDtmCv=zZ*38ZWl?dl4fnQ?+mlwAZfL zki$~=;>^?bGkgc7UR+oRS)g;VJkrt8wAd6;w-M`D6F(;yY-ctSlh&N>Y(;T)aq*@L z>CsgryRjK9d(8i4pluNZaX6c=9 z{B(NZb|vO?uz&J~SY(=PXCQdNN=@($R}NGuI^1{?`?-|%J~~%e+O5LRz`Wdk#64PnjRA20876C3_C+I!Y{^&7~S8qYm-;TejAM9{|2L zMQOs%96q0`6femfwpyv`6w`XsN}it(s<&jGN#W^oq6S6pdpwwZw*qecH0&@PxLG7x zvqehyPp zZBGJ|lq9pq+|mZegDeK-HQ74@Tr!O`KWe0!0Cw(xe(j;KLwOnx$Q<;w>YF<8if1PQ zVwXRAx8k1(QbcW7kZ7vfSzsUiqYERwM>YO!U+_)LGmQ%9cg~6E2?v^2a#9tkieBVn z57BbPo%+KCOw~g#5{5P1&J5YJ^?2TrYu#hghjLzU&`CY-OPalJVOrf~V(@+DL4XD2 zXBSD4PD)+>N2@FU1+yIJ-Tu_I_YqFhS9>&%$N5CNP`h+9ouHHBQts^EJ6^nRDpV}B z(k!a~<~_MPvf3zL9jadWk`@0XmI8ex7kKS;?P+AE{YqeRxB&Zf^V!=PH_aT)xb*>( zXFnOMV4WVn8UHvAm5od1|6G2K!Hs7%TkBuT`6;Q#d6$8~Rs{-r$2SDevypWyjs0jCc1w zrCcYb`mIW;A^ePIw0HsP-e2!PZ#0rPC1|+BGxq4C)}wcupKPACEWIlGBR;22rqvdG zL1)a;QW5rZ>tVN3d!l~O^|+Vq4%dV4FZK$a+P^Rk7W7H+xHslW)_Y3-u0cz^z$K=E zdtzfNzLBR~aeGL9=h0A{RkFdeB9cGs)6V>~bv5|+o>kgBR$uC`1E*YX%SuK&e3qKX zR-|2ajS(I|t8?%!nb|cuwqnI@Xs-24X*M1|&EFDS_RERXlnYgv^gb|NVEvaTYy={K!*pa{+-qR_Y zX#s&oKKy0#Xebu%$8#GoU%ayz!wC<%8ILZo9^a3v~arJ3?FC{FY9jK4K`r)|lfQf9hdy~EDSe`IMC5i0Jl=Pcd&H-F>QkD%+u 
z>1w&!#5598;R!8t4IuhUmO>SY3X}40XQz(cMc?>T-JOu$S_4gJXX!QWU1XsSRk($Cbh+zB~2h9)3<$7mjKU*LQjU%R8Uw z>$EaiL`sml8h-Yt>IYXd%Nw+=W=j_VCkO&`t9}{*Y2<;x%5Ty8`{RVdOGYmI@t!%E zBhC_Dk@3z9VTJ7tge`>}^jwu}f$GDGWZ^AZ#B~HsiAPLTi`3~ATxZ{U{S7s^db)gQ z5;*85(0?EINCdbW-Tp~q5F79u3FsOADUQuu{F$iN?ep>q|H%iszZFB~fJ{)#$>4!lgd3tTzKDU&PJ}^dYWSCOrA2cy8p`EyR#Lls=BMyx`!4JemJyr@&3*mfEZHUnAjs z5kir!!Kdm&n~Z*@W~yJr4%w3N?ZJtWJRmu`TI=*+ak4(@Ks-b6CB7)nU0L3tm{ycK z?_+$~^@vesA`RSEgdZu$b#xrYP1ApG&58;~oj^MozhsgoY2Lai z-}RgcI7m`^OH*dqzLq5!A#Oj&;575zYWbm7vB7CnlKb3@?Rz&)u6ayeG~nkw%3krp z^zn7W;Z&k?xlsbE5y6a^ui%~YkjA3hC&`!}8B~U)IMbi;WLV8>^HWs9WAlLO`-v5h zw0;(UC3Y5n$rHNq2LryjyJn?%W>|tFL?msg08xE;+ge20lHgIx@EL=j>JI)AX`}i5 z@a--@A5JM7djIO(7k)G8t~cM^wb5e^aqff`-6{F$dWZ-IGS4FQWAE#Uq?K3^e&B*I_C>rvxAL@XyG1pE>t1XP*gsj0EQOA2 zhAWVXuW4J^Nqp&4ks;*X3^I$~hEYs?Q#_+Hf6!TuZAn|2ev`_}z;RRd8D~h4ryX=>DHU@Qple5GX zIL-rYb99fX_-^jY#qYC?e-WqXB0pDqj>^(|u3-(Us^G(hJOm?ou1)R817H45iJF8)!uq_-D_>jt6$F z`e~qM#`cW7NyuS=>>{$3Cn+@F6aS`)^nj~y@K70^_`pMd5-D5O8y@v^4hozrO$|oy zC5M2Fo^g2VUunNJ4_*3p^_h83_CcbGj%4`VgFBs$gz)aO;@C8?Myup~m8aKA1R+-+ zTNIt9CFoGsAAWa_*peN;nu#uWhhcDyGh}ebo*Ev{v~#M|FdRR+YH+SRgI1yu{eE4V zUGauP4|kO9`y_|-am*5Lr>t@vsi>=&_a&zb4701H*eodt#p3AE)72hVDwhH|jS(r) z>pX;-dz2G$=xtCKay_%=R{t{M6~rl^;K=q!zsM{X^Sw<<_qeeiS0?Z6p1#G7xqzPh&QmfawW zRFEeR?-n3VDxWqTXWdJ9^R$BvF4UT}n@5v0S|!M7OR<&s`WC66{>nYcX26_qVhF5lwa^8CT7dFE-R+KQx`jN*mv#$!xJcxz1U7+TY*(Y_WP3mx5tR=uHRJ zed+j|O|?{YxEJD8xbm@z?TW%&QjtpqyL%dL0?s>c9~Wu~S=1aIQ&~Im4)_-f?%+QW zs#{N0`8Z(pHm@n9)WhNnG+l)(DvKg5DuCF}fUcuO5W7ebA{ixm*sOQ>tQ>E#2=9AH z+T#Xj_rbS?yWzc%unDA5H7R4C6hk1z9|DVefzF_|LcB%ZO0s5Y^@sFW=!s7V-B;xa z5yl(immKp8mMD|9rKNS;{k!a^4P_B*#$c~g*Hw~du-vJl`8^jFM#fzgJY}=L&(Lfq z?K{z1rU!SQlKS4Vje_pC)%oUC*vy%&cq`2Ef zq(uswflK}9dn1)63m?&J5NnM4=qTvrL$PAK#i!K5S6^&>+8L|BEVCRCRlWHi!6A)09sH3 zV>xGMZt*tm-c2IA*RV0lQ%lr_2e2SJvE;FaeK{Y^I=7Uar&x1=(75NyYoc0`Tlu_~ zpR2|)wGYiq?ts@~K^Ky;|H+zqBSU2;9XQN_q^ag}yzO-SFl` zepqJ{U*b6Z74q$O>#QdUFeLXdFQ^Q+MB3%{5mIE*^LtK0oP3D`#R+P@1`_z1 
zF;Xb=q(?G0|1f3Vt8%pv3^|6NZHD9{F?lu+8Di?{qWVR@eFvCNh%xSG`-@A$yS{@t z#Pwj~m{2$pX76!jjv0B3e||@WrMQ^=p}T8>cTyk1?`ET6?JBMsmBqorQU(H;4gpA2 zd_OtF>Pvb~fzt{X0O+u-7#$i9Qw^H1JaNb*U^az?l;6eFED3m{LpLtes^sO-Od#yN zkf1yMbXRGz$-ERBJVdgluZvO!)d9v6=e@~(u3dvmqCab^Ol;@N;1vt7V- zsBrq$Q4|Nd5{8js2mvKZG1^T&Un7l;a$JQSt91AclN^hEEY21kLKMY|TV90W+*?$J zBRcbreK5DbcySt;;bra}5TugAmw&f8XZzzSX=<+<;l=nUsQ$G-K~z~OlwIf5>$>}o zvcl)V%OE6hkeEbpN|az7-QH;|{r&vaOk0ypP~q z1E*)n$~AC|=K1Aw4a&)2sr04oz0sUvpmL55@S0TT)Z{6?yqfpaMD^0yq525OJu31KdKScHmSRS(gVO-zBG9km5dw;F1!^9%}EE` zC>=FYm%~Az?$8E2eSOq|(lPDq!h<`0IvDS-RTWI?RgI^L=r5|r_a6TGc)4M5LJ)@E zMxdN{j@oD*)@I!wJDLmyd?N%H@~HAy6-IBr1;lPdKLP(G<`~Ty+H;DSAh14g+lDOO z4&NXNfZ0F=_%z<{`d~g5jko*`D_13?ob}ouHcw`+-m^0tiUj)}2v_@KBx9kB={ibV z7R*>INIA)VK$-L2gEi2_tl0iL(qmvCXV$Ok>}<{KP9QTZ!plTBY%pdfPXb28iLw82 z)7-%pb-VflWcf>ic2C5z5QQ%dmjPrMhz=j27;!gCSU3U|GB4i?sh%O!WA&MCqxhn79$1>S z_$qpBbR1{r`-PK2F)P@}s>`$?6Q*BOoRqn0lMZjMF%xA7XL+f-SBYA0+7|E^g8 zLBw5bgTUn+OOPrxwiv@|+?{+XvUzT9gJE*5Mu8M)cG+$cNy1yO49M?8$Uo|nN8SF? 
z4ryH_B$sj=3eK2)`DZrQp0cSh7J1gPHj$#v_A*GOy|S@p&3b2h?hz3)o2mA37WWjw<5`O#=&7=`M{_aj9TH~w5g8{Ei6_Z?Fzu6m-S(_o#TO>@S3*;VN3 z8J5itXH(}MN6r#Pe_;>UJ>DYiaW){f97TDcvo;fbbPQ&LHOrhpa~J1bg&m0*>b$pL zcLM3VgnoUq{rCqYiV&sZ|JoqRQ)%W>So*aeRj02SSfl^b!8v0t!}MBWKM1J4&f57O zgVWsMDs)|eQ7Ohd0=G(^bcpuS{&Rs&;JMi1ioofn%mj6Fyl7V;pSmvyvU7S*>5& zpNn(@n{zop&<6JFG@!TT@lj>MByFDeqwQ00)iyu+ik7+Ft>~=W1!F?dI;zCX=OaI) zUEI@^$BxL*Y;XT5KdQkb{2xi@9nIGJ`0)&TuTpzcwQAKCrS@vkqGrvaR;g7asx?Z@ z)+TN3R*Tv*Y3;4}po)Z2d&de%e)oHRC;#0OCpmeZ``r6n@7McvNP4^*n$S6^>oO86 zIYd(Rn0%3QVLPl>I8o5iws#?@8yCEE_DcYty3y}sy}*6>hHcKxW>o{jZ+M*Pdotwz zfkYNQ9C-{O4;LAQUB1UroUC7>z3$sEpoQn-G@F(jO-ShlFCbEeX2$K~sm7SKNW2+% zJm~wU|8AW&M)(#I-wplu-lP!3ScyJG4Ukfb+`l<#NgoT!Zcy@P$3D7eS_&xA(0vOL zPOWn8+gnScSNn2JOY6Lkd7WaT(N3Gb7yN7kvBH~;MI2BUKdsp#m;iy_8`RVS@S4l* zlb_XMi6$;4bZDWp7$kE?`+SrczcMdBm$+|uy_=62{-}iyVlJ3)WqsPo(T%e zqo4S@HEVG5(>A9y4;KnBkwY~y>_;Zaoz@fcXZh8xnBU|z63##7yNg7Xr0(epjqMl- zROs%pTz_0MVJd+`qw0_xuj`bn&?{*&SvU1kxf8X9kyCVKK3!46piZOQc4CSI-3Cos z50~fH1 zr)!kJ-;j4E>Y-`9GYK~@8qHT;UWQ#L;gkna47}+k5)@H+r1>`)6XHr2ywUVAkDi$o z)*Y+@d3&a-1_UChA-QH19`}{bF9T!p>0tPZ^3Ne1iF55N%Js-oh*U9mKl^y$Fo(lA z3&Zpd*=S6lszmoWC>0EwgX*m}->iBgV*%>1Ypg1sxzb3W0qok$!+;|ewyOC|ca zBqoPKotxo0aQCyK^RprNE*{GM3i3}Fl}CCwH6$t*O}HeSBv{fk<2ID10w|u~lsD4K z*e*veIetg0K5OuVnEQV~33l=br@@^^;a@2{F?ZooydIFQnkAIBSiTn=3X6r2Z%Ur0 zsS#O!z=_J|q}`)!ln^^J8&wo9VP@Jxx)+l|8i2hyRzC3(5O3>>C8jSUm)E&*6&omR z3nUXEC z^3I>RWmhXX{V^5SOllj$oJlrC57TYbzUz~6@7*MoJ#>CQ5sj}|hl^Y_$3;3fMhK8t zQHwpFD*;&x?;&;B>lR9bx$M2|S7P28ptO-zcydNUHZeP|*K1`wOmlP6wNj`Bb9S z^@bEVBn7NTzUTZy#5={L+ZSece|TX;P#+63YL>lHh%WM^6T*xS%qS+BIEzbr&J(Rf z0QD%v-$=XKzJM{qek=cSjtdmiim!mb47%P&yq2wMp@JM1Nk0@z+mh5NRtXY(D0mGX zZ#xU4n=khsWPXGswN~hVgOeKW4yroGIBg>4^W*Tx*{;U!`Z>dytIDVaf?Ki#1wHe4 z*MSB~$n|x-F40j=Z-Qh@40ETw^RMW4_|KYBc)m0|`X7hPR?K8+jp3DgJ$+&S=G6BB^N;8pMChi*aj>p7Ta~E{ z5n$^!QPMHxi7=Dr{76yAV0upKX(6q%>V?@ z>vbmIQNO+{3<|Y+`Gs5x2=?FnGEF|7JFihBTe=C55-vt>6oo@McZ2x)U#%~u&7*|? 
zDF%pEJHb(nlIP#ENXO*#%e!a|`xH+J?a+r_&f+|#S7cE!uf3P<;X6}pD>|}?GPR)U z1A>gfruoKx6g`O>tx*f9PA9qif+Iy1?!u`?pkJc8OrU`xQXOrN0Z&i(tf=e!8BX#7 z2?|Ej)KW;*_Wpe{+SRTmwAD5)g>)97{`L<(!0E=}+zp>CV;Q=nCn-mB9T8R4n7-|_ z408sQa2GSis=DNfnBI?hN;0pJwBiI3;-A^<6sl_Zon~FVkYcq!VDC%JOG3&e{HRxc zTuQKIL|;L@vZ)`?VNAmG2z3L4b>E(CsfjRr^OfUTC(&5EO(;BY?R0fUr%l0<(U1vp zw0;F^9wm+(P>D_bd+fHIqd7y(&kAE8eOb^^z*eyR=q^wofNvSBsfue2s2&E}cEhWu zm1C~~tVRLo8Q&_RTc!Hntq6KEKpHVnV@o!VLp!GB4Xg1xi$lS;pbKC=2sfIgJ;jo* z{tDEyGxc>pR&x9seAD-5W7E-0keygd!%Ukj$L9Uz2BJpnq3Ajnvm zgBaJW=3g+N?;g;3{PyOl+jPwluF}GwNe=*mg`vHm_3G6*aENUXot?k9u*GAy13!$O>|H#*!S5 z0^U$;TtDy8U7cO&-$ecaL@eyJbf!?3^A$XCWqXYJIw1}50{uONs_<9rA4p)_l^TBE zl3eP70$p_vbg8_JY)5OVVHLEKndZdEn%?(xY?oB|SN$W_;>u>Cy8`+(CLz{tMstFKZL|es~Q?`$JQ4W4RT-OE0oU8-H2i|V@ zuva~C^DuYg-CHgKLuXp70ou>y+LPlo{||F)(9F)H>pPGzK-XY}<*OBSj(zw6xzrAFR`^!T(13oMmtMy0Z>#)yjh{m+h z^V#Kf=2}C}4%dP``79s)z6*Z9$s&xE;APtW9>{ktRP!ku&P((HenVO;Cp{>)Q**73 zk%QIrADj|!j!3(p#Bs!rwMm3D3{-DEFYU2!4b* zu8XFLI^b~5G!zPajdirJTHw@VSqM6dlCxGdW@2!eBzM3QPE{T3Zw4-u=L&g$HVRBA zeIWXOw%%g?N0jT}$Mc{N$`*PJa;JL9zdCj>P{%HI%aI{Gzth=I?7>!D2&5Wf0px7=H||h8y9&*lKYp`L z>Kqqp>_0>6_L279jY<8*tA<9O^h2u{0%-wP%m;{VMx3!shGOr#1s9EMf8dn20w^2m zxJm|a_+U$$sMYZn4hHIi#(?|*Nn@g?OVI}BvsK4M>RgCFbR76BFY6rhm6}}Zf^x0R zxBNLifJZ|Ci?kM?zpciGzA21PbS$`3!hHj(Zh4m?_4&V4p#3Rwz#=k$ zQV+p6o-38=Dq^MPsd4J$N16vLEyKZLauZ%okMsIvWFl@>e!_T@_BT7Ff_CnrwMY@C zQD+Ds`P%G3LjFRqjkG&IW3w5_ocXOn@O59d;#2fFB*@fkG~N1ZB(q-+obkm_4Af#X zmLder9+C6rMK~nJvr01tmT%4NB6lec_p2bif6{3 zVF3>Z`&4lhSN}L0Z~A^LWr_gH=25{K(XsZkT?L#sT!>*pO8AKl)Q;)xOwH({XGfPz z=gjAPwpq9BD{3DOX1tWS&kLJEI|8~M=s{waE0CDMhXod0X_3c}6;J&2H{={!I}vWu z$Vpgq)7_WG3LcP!gPztB{dM7>Ou1D!%XG96d$@4 zs74vNv-{Q5`gKOl=Mg$ePBDe2F=QH^SDx3>6Fo*%KbZaZ-1ZOpx-GzJSyXj8GIabP z#v#JDe$H(;6229QWO!N|#9&(9;MG)gYD#BHAIazSu&uPeGeV=2)lY9QXc7i&+^ ziqI`q&rgzg>4>9+Stsf;VSxhLo43Yezpqx%q88hWc6*Y>B73>6}0uH$R8eCz9vK7~`mC(63`!_YS*g*1oao1`2&%{iAQbgxt0lTY{byZC`4H#8YJ|FPj5 z1-O}xgo%sC0a zy%#4A0E7?6dFHLmUDG6VG=Dk&VYv1LnidO}J8AC0PQ6+xth`&8C|r*atOa+t96YXS 
zzSqa7$Ky_^c1Zcitk~_~oHsD1D_BJ0L%a$bXZ`nnhHk*LcEG6B@$=s3%Iv#*#d6A^ zo@X%^c9c#K(g#Rx#>Y|JE|e%tb>$%Ts2; z8=o)FO+^p|=xGi#iRMZh|9*#pH1|uc8*W6>#3Io!#O3r-d4y~0otXo8B?q7C6`&i_ z%|zF#= z)*jlqhDNrnqO^9T{B2TM$aZR+p?g~^yr7)l8g+O7z%rpT?37Bec35_k9XG@%w<)D_U_!4@!D9~adJ(H*U#Auq;gvtHNIfp0DfFx8x$UUb|6xE&g!`rc! za31mtTIT1`QW|G z58OsApibfb*)6?V%SydKt*E+Xc(WY24>i}*%r5!`&v(|7D0EZrH`K-Q9Fe=s1jszq zj?C3>ViY;?w#5{;r!8dY(GXrz6YRUg+rSyOx?PwOs zE#5LXr>)D^yV0Ph?3=E+yg7qqrMNb)e?itJ0 zy{h#mvB&CVhZxWQLnw*ybGt^5KouA}bn!NH@Hg-Wj@jrYfzLngzDyu79K*O*E)%-I z16BGCLXSK9NZ#HW7Oz|$P=JH;>NP6m%g&uE_WhkuE8~DpXyFq$ksfCLLdtla)fGXb zrb{l^@@YQ|CwpJS6@&W%<)Dt4p@W&n`X^dgroSrGzWp3KHXwb^!sgtr$1O= z)@i5NkbDWSmRL*QC(_D+WX4N_Qo2nViK;F|57x_TaIDm#_ELS);4e}V*;mf#pYDtC z`3i6|%QI_Pcuaj;_A;LdFhIRcM?|DY^uno%|LhvC@}5UJm#HmNjO0>IWEZg09(^5W zvl$qO9!QS$^aaV+73T~!SM3?fbV+U`e6^fkxz{k;vC#a4LV)FZo3eDhNmCh(ONQ+D zhCCfX9ZMVeum0s}CQ%~vqH(adhr>~!cX3`WFC8QgonQ(|iW0BA(;)SQa_1aofTwK@ zbJC<@0tIVqTGD7I16ao;fGgWy@g(xs4`_rj-NU~vez~PjxUK*y2T4LF)|R^^9@PX* zJ(%EY_&u!@0GVI|-=-8}BIkO;IFX(8XfHQ6%*sQ=g#OhV9r+)}&qnHc#)ba$m3#Em z`QMSbgH|MOal1m67&i7pI2>V>P*3V?0c=d}nX55|=1D6yfW}sE?k1jN17z2cH|)ls zAt8G$uN?gO04DjIG4aIWZsBPB^BXUNwKrD7-fF#FTcBxGanrs-$Jsw(QrzJ{%49RQ za(+8+5UtHf3VzCB+VWp3|78!$M2pIZCcnylfEj~RC6x%#js9jb`Fo56-;3my=?o@* znj6A>cY=4kE6-6Q2fDnKKf_#TZ-+<~K)L#!nIVUK1YGG3K>~jd@;_K9^bW{o=H!d$ z?IZ--r86X4Wd9dUIc#^|h4Ty~FF$kyk$ZkXH(_`ep=85%Ua-W*eapY}WUk?KhbO*e84eWE0??nNW;{?cG1X=f7K^pxb~(cvv6)u6n7 zlqE(f+Hi8AN^1Iz#=Lg$$0ZGrn*PEK#-7fLd$9+K1htUgsegma9@%>`iQgzrc$X{~ zhwxs3fto|Im+f!T+>3QEw0mKExKB64Blz@1@lcU;4o;jEe$TK9?kRH0AhOy-Q;Ux) z;)2~%P zW0RH(&(0Cb+6L{B#K#ndEAg<4{!fkyphiWN>y(OLUk}h}+xYZn^kBFQ=lL3T)!w{P zPF1#3qEEaNtx^WLad%xz3}l8`dsyRmTo*f1;+Lx(y7q1BCkJ98?B_GF?`kteZqkv` zzb;=_3JxPhk76i4=tFfIHf>KX+egfG1)?)QuJ1f6{1@{obJu7tBJ2L8_)5?CoQCgTgSb7$ zL<|k%PqBw>d;oOa-%!FE^6(oB*j^`MOQIM z6&(+~?H}Wvmyi2Nx1&*}6u17ZG}`NzrI)vqFINe_{`>Qc*EdPvSXVtIcamM4!<#6| zsr${l5h1k$dP^4fjZWEyQKL+2q$*H~))ilpS;0_-wyuZ}>f_2flh}O6rF6Hr`r@W1 
zCSZ&5$7zg#&dNFF{kM#Fv(2B!8a(Gr1*TrQzj(eDuQR3buk#wqJ(U5Eq=oA>qqrR&n4QtB zp3llw|JZkzZ4F$~iInRM6e?a&>c>CTx{e_#ykqIpPk3`F- zBkEv_i`AGGPJ{-3fiBy{lhji0mZoD>XP1+pGj(rWx;Ja{+%fD%_Y0|pP4Iq-T6aMP zFChb+Lv`->n|@CWJoVqD&+FR?4PG5&FIVVZcw64TE2WpZk)3)sQ~05z&R@8c(6@P* zxU77gdwKB9gb$)(PDYJsVr302$lD@9=I%fKdQR}%aC^))ti1U!Z(of?VJ6cw$_Q1~ zvdgdIOjKlOYI>P%dd%NDS;}tw7cLZR>t+{WwJLs_@c3O?Zq+SHNLJ=``V!QS(FV5C zD(x@b+-v`?**jaaCuw;PEM|O$SeN9DK7|JHutrP+HmK&Z5zzc9g+G^ ziRDb;p!=r%BRi+Mn^w~H1P1(k1XgX~nn#jPZV}fom_5Xdm z)%&=K&E>28lfip&Ou-)01>ZvQnpHPLW2JfiES>X6}eZ zOf_Q?lX>(-bvQqf<}OWx&b2L@Tz+mddC);~{c4-OT4RHwG=zbS?H*n?yjZVCL-H=y(*L5;gTvuJLh?31f3K_vfd>gVXu!)Mos) zN3{nw{t~s??9O+z815!0-;K@vBpDU$(^T=w;KsrfTM9*Hc9LWz;c`LH!a+)B@+e25 zHl5{Pxc$R;Hlh5_mL=e}2cGvd|72gzE5BPG4H{qLrq_P|@S$GLR9Ej$-ui@eoP)nZ z;~U*od-<%Mc&5tg!;d>zC7_$Hn06I!JRL3XqyO}eOg*cdz5H`E#i!rWP71~L7ZdW2 zs`#yQ)D^KElnRP@`_w&?WMAo7PRJ=;EfuNmKfXnO?fR?#5I1i4IE;`_i|R{BPJb9K zrvBlxr(@TIS1s20+#1DoFZPW@4w;>cxrx+(bPtrF7vZhdvJmy6KFRoN=x2$J^qDDW zZuD#NFFtDW@G$E}G77Nn>s`Mdx0WOvsb}e~@9&%4^;H`S7}DJA4@!DZ^XYc5zN-!X759o zt_;y=icMXJv^JB*L}a9qT!-e3g1@sfjXoDx#tbJ?iW=h$o)&D;W?ba(Kvm? zU@NRL9Aob%C|c&>5iY_+)q96-l~6TNBRZK$1pg&hAbb{*%E%bVD(in|ggHROq(sy* zFVg?v2>$jN?x=iv}q-=CK5z5?{V(u$AXk$Y8;K<(4RgKwMFlY8r)FP9YT7t{ieDqfq*9;#i zu!opgUc}Rd>oo299hV<}^jP+ItbTf|ka+tS{ezz7RV2#St<#;2{lhmL8a%O{NBZwkga7_M{UuvbG z(H^+!5G8)Ubm%MjKstl0`lElqy-wV{o1i7xx3X&Q4y?~nnbNmfzt7!YtSr?1DucK$_M1ym`?t)a z$v`S_62Rt3&cZIHQYp@|$*k`{f>{d#XQO&7&d+=E`?u9 z939$StEATsbf1}@$!yEIAIqE@f{BaK$i|~t(mTyb0t+Bd_-d%x^85y*Hi|7 zS$45``u@!;dtsm6cY!0XV#MXkZfk&L+_9}(^a}dyZwp4R(|gFZb~q0UoaS=61iK84 z2A+4i()>(S$P=S`#SuBFKxvR%7`W=4mAStb5uMP{A7tLmT(O-Rl_!0dl@d~{U0NH0 z#g~r`+@2~fH!}N1n|dsl{XSfagUXhv(6CK|oe~{2mZ1uQV%(^E5OWqIl(&$7>b3q6 z^4oK>&f^b09dMW4jGM`#bG*qZ7~Gcdto@VKMadjR7SH!8~NC!>b>O zaZzJD1ajFZwG8tJs&T=?0CRNFkx$h#LQ`>*ZOz0ixTz9s)Iv5X7_hDOR~7d4{S&|L z=>bv7x_4Ko6(VCVj_gw#jWgfV?5Wqk;6q=!JoWTad*b97r`ep0NQe?ni^^!}$86_| zbE+q)hp1Hyxfl0|3Z*=b)+GPqX!xD~sz$$a1~!c|TX3+scIwu>VK!0`rtTKp&sTx? 
zU-r&V!|Qveb=5$kF)|*vr#cja*n~Rr%bx?B!#V6<)Q^Zq3(OC!(E@#R1Bu(3!RN4b zmvX(Pm31Og$Dw&4%;hE_NGvG##MHZ&_WNrCbnGt}Hf7YLPx7CU=p?{Hz;dF9)2d9G z@e0wZiBJA};`T9bHJOW8B(GjV&zbbqIC_-R>cn|NaY^eiaOjRpHu((Acw&r6TK&RU z?0wrn6@tuj9hrwE4`oAcX;h$bW{9*K_g-Tjvrr*SCCzNn)XxpYuCWdKJ{eS8zxDYj zT2-8z*Mtllgv8yONu1xmN8VzPx0m!W`7VV#%xpp!8`((Tk{*9;)X&FM;S(ef<;?bT zJT`pjW&;JF-q z9I>1`>%)MMC>L~oxK5Jua}j*}g-hYsU?FCImhPZf8Qxd*(D7uFWsN zkx$>zRAtk_yK8$0r`iFpPd=}?;P8{>dzQMHLEpW7Jj6t~Ib!3;WOs9ez!Uf>{Qd-J zXZUOHUqA2%<=o`%hri&aSKHmKJxhJC`iBX|Bss5XCu2F;YiG_T;$sAuB7WS6^m?@_ z#k1t6WzW~nQ&p`2ZPmRl+R~GMMShs9k>qMpX2kkG+ow3MFYV*jTBl$+JPhHR zFy~_@)ALlHzXK2845DX{pl$<8AH{%=E2N|-kG((9qYh4%!F!Mw=A>8H=a21q0(0x- zY6X32nQ*(%Kar}wX&m~zNg$mDFTsM%i4`{6Duh1ph%tgCalcZc6;Z+#CJ|T$Sy-ni zjY9bsk?Yx%HkgyZ;-zYtQmhqK%3m~*Gi-rkQzt2}Of8>e+{gm=U+fi7`cj;m-jwUp z`KtY|u%zYd-|81=t=eAA#+nIbZi;Vx^aF_^s=QC=IoL-xe^1Az2x}?h7hf&Ydh-~r z!m7t*N}3iM?|({{OlGWtHcV1D1BFul^PZ2XfU@^kKL!b9#zzITBaz5~VV85#h(Hft zZ9pAqEL?d_EYSKdN!%reW?=|4ufTGTf>Fi1-RoR6L-0qHdzZjo*pH4O!CLyA8>F)v zFfTWuC*E0!mtofJh3CflJ!!9q)EX_Ak`gqQjayH+0 z3RjDn{Bx_N(H6(QsH`nsf5EG-das|O?Q zs&n!i-3SElK={M!{qM6P#tVbI{h+Ni-IQqXK~lTst#^FK6=8Y$ihNI|@>tOp9eJmw z%Gj)t7T?s93A-(#!U6l-v|UHE3;#K&V_hZx+iDsQC_V(v5c_qW0%86`9DAFwNyk`7 z&||<4%YN{Y;)5R+>a`pnb5*JNP;;poU!_mtJ1Cx2iFd`B(o?)#l6Em6r$SQ$LHPj* zFGr$-Ri*^xV>o09{O7ZF7=p3zwQnQ4-(9)g=5F5zi`_L{CshKq?O%!go-~0`L=pH8 zTHGi3DA?4x`e?f#NPtJ=DQH0nZh@}cR2YfJV+B);f2 z$5d6NwEnAm!6d&5hWA7(C%cY*PVR>j4e^{*1b%=EL5~A&67Dgp?a8;bbqvsdwKWLG z71Es3Ug;4uki*u!n@QxQvV|h+8DV;bbqdBG!Ti`pzJEURcg2@FqlAhD4az~@)RH$B zo!KLdoR@h-+y)GL2{3}ONCOGm=V68~D0d_QS^YSga!jZBQ`*Uc-}ssvFU!D*=-iK@ ztUmJ)ZKxH$3IHS!p&gK>hAA;@(0A`vBa7$+#B#R*)(b#5z5=&+R^PK#EC<{0I&~$Q zF5Ey~p89->4pD*7T4KO1KWX1h^RsKv%L(vY3Zg_h(4i zmtbi!xE`?zfP5afCI|Dcr0(HPp}5li#W-SQApc=8Jm*v8Cy+K`f-8wlFcRA;m2DtK zw!PXP5Wqq{UVBT~UTo39ttcfl*fRCJFOxKs8x3Gb5ERte8feGuFUrg#`*^)pOMzS+ z$Q3JNWy@gcn>?W+4b~JJ_la9ah@s;Xz4&XgDF1ni*1h62z>O6-L z^53;udLVT#TZYjrl{vD{iLCkUwtfi*2H{FAN;)75%7U|&>%oOZUq}Fs??Ey5a$^}y 
z!EdKqGbD;A>x%@Ki1C+ieTL#NI2Z*AFgePyB8sYGL9$DSS7jzcGU5=S#=uL?i1n5Zajfuba zF8$=QQ$D}!nmKmA`a`Y-MP(8j?!t9FS;!jHd7G$tpxnUF2B6mqHfjcQ;}Jgs5}qe( zVxi!jdl$&e7;eiF2`UrEy+rV$#33xZyE?zSsoYEtHD30jj`j3TbLqYc8|LeFNi!77 zd=>Bu)X#DUXZvIcss#ah&tsB^@gpI%RJG$aD&$>EowrTGhPG0uvfzo}E&0%B%Kw=@ zjNnqio;Xm;7yH~R0_C}1E;0SKdkO`b#ZGk-Lly%Bcn>~?R0Ir-=;?lwAL=>laD8_a zdnG#z*XCZrS=@Va-pm-q1hO#3&Hf&+8EUuRIlW-Npr+Eg?M~N&G|e;sMi0F2pNSOX zWisS(&9fJWGa#B1)+!`~=M`TdP$d$U~Pbxl@Ta~_{P+Iv{FqRxFnKaQ3HrU^Dy0RRx-ov4Nz!X)#%8$ z7gqX!;an}MD%k5rKu$8Gb>M;(I1nvDp!N`8I{U$u;_weE9R?2*A+*-k>MXRn7WB#- zFa!38l2sx%;RBT|r8e@@y6bcr+WtAeF;3Ui#(V*_`p?gJr}hOc$qg3aHWN^=aZFjO z6eZDe87Ae-oOq%}-UUWp?|Q#!apk7)IoGC|E5LB4N!b*hbM+;0mSU(zm(eR4$js!& z1eXZa6=$!8fsT^f#zimW2xbut+xc}B^;#t51ImNOQdHhRn%#a14Cb}?^EEQSUE zcq`#g`Q;wZKs^f~9u`~3*5=uc=d9Jk8w{syIhz^QYB~3#oU2gELuvG@wFHz)>e9h> z{h)>5F8kI^yhB}o_s8}9RVPkObWT%9qv)bVtfOZo51vq#QzyY_T5gI{UnOeQQb%F& ze=|Xngrvf3pZe7X>L|z(6rOsQabkgd<+)>>{on-20;_pYRvN93>VoSZ!_9U{0I9Gf z7jRB1zHE37df!#(8|^`5>*sP z3xtd6l9Xn-+ zXB~{lz4ZLtOdi#eMZCLV!>r-CIcPUN-aG)8slzCZ+w(x|OG(lPv(mUff)c)`S|4dt zFteiL-wY-(ygs0POOjd4k|l8%=Y<1bKJD6?ibTqvM$=vewG$tGu}WK~e@6n&4TDGg zSx9THzUt1&UK!^~II;nuIs!MGuOI$Qxc#;Ne@xJi38SeMp12~ZR_sN-)uVCefL|Ev zp;>0sVgAw|1^~1yIeQ+XPGFi{E5yM+5hIWIw`qAm@bCAZrkx3wDq`%>1}AXDhIrc^cFg%&+x5#W{iD_7I`8Oqyq{VCWUy=;iL!oIp{cTv({ zGuhL?j#?hwTO-lUk)iD?*2DMLRM*yNYlE_`KrLjqIpqrv_7&a9Oq7XLf zZ_xuz%{h83q3EbyPoM8-tTvZ4_`9}7_-opb1sM(@TxD>2m)HfO%)h#Sp$gIpizFXEu=(iF%|jk?jtbEidP&A=D8c$-b4C;Q&L)LYD90Jus{-h z3F__MXN%%wQKQq6eUz=Ii9K&VU5d1;z zI!|uOI0hQ3kM9!2i$^Rk<*v$qeSz;fLJfq9ZDUliL9hR8)PSctFi#2eZ$^!&2uUUXs8%&A? zG~rnPB{x6A%#Z%f7+aaTU~Yo$rLz9nuMOGqu*DwOdt}g3-8K1)AlG{Jy>1fqLtSb5 z#mA|u40z!^(sc`uHui?(mC7wg+-zx)q^Ie#?a1HnZXz-1#;I#@0q@}%{E|;7mdo6J;E_zg5&8Bjf{x4tEU29Ci*)1fCDd`NgS{Gz5lpGPhyLdi1bCHK;&7Kb01zw=16 zC9qSZTb@kez-Hm)H%9lB&>_0xFc>U5O03H_`bM*L>9e-2)D!$K|D$5-sy!39lrHM~ zN9UFT;%AKBLD+e~f2O-$?X!`-JwpT>F>Jy)>+cyx8ZP+X#M6nhLlWE=H5hQCs0Gu! 
zH1<;z?iM}J`|vHnpU6#{i4ugR`@gidJ&(TRFyY<`+ed|VlgzG?JP2op{NJ^*rXn!n z@NyLV_#$H(Y5;>>ci?!Jf~Qy@mVU+@kfK3;^BF@c_qPfB8x;SQ{@(5a-Twr`FP)Up ze~;hXU&7ua6S?5tUQR39CDB8@4}V$oSbNK|PIVOJd_)b@8R!J(|6n6!Aw zZ`f{&N~A{xle$SsvDTDpT@>?LGcpjaf&I{f>P4VQG(`JMg=2qDeWttl7rbBlQH_~% z+oLWE;x2)d8k^Ru9n~CHc9cb=Cz3<=718;0v%8DpdQ7+O7N#@J>b&FDDgDTnr9>zG z{k%&eHGXMJ^7x79-8MZeid-5C&VsO zl3})0z1Mo}Qh7D>b?Qh$mvB#N_=or zhv`fv=DSlv%5$&QF?uLowKukIt!>hJqRw2yAHTjC`CP{~y10GPlU_A(5mrXkSB1i^ z+^$avJ9cJVKpeezVA|7xys|j@J|y{zv@^DD|8zE!ZMf4?M3nfp-V3wqe7VQQ4>J1I zaF@7yZO>fJ`_1PeHZz;=DPg6#bR<)fTILEdmCb`bkr@m3Dmr|r3*NtxnNl~PI=-Ad zTUfKK@4E0%q#8_%rbz<|r}e|&AsG{1`aMF+Qd90&&RJo0qrC^E(P;Lfl*wSl?}nCm zYg>0m_xD=s%t|)y9qj`Oy+T$0ZGN*|R&A*t*n4A5E5*`$quUFuwT%lqz+C(78Sc5z zcEWgZ)t)>eiKayb!w0qXkZm13&IhM}fS(^8H-Ru~8eqUzN9FUjL z70^L^_M?+W_8PjbUAUcY@;@t|^*dsP)-RxfW-8TqGJ=&!uD2yj>ql6JZ*eO#ZtMxw zOJ>S~{_YS}U9)~HN9VISWDSt<)|#-u!%0H*U#4ywUNHyu#QKLFz#+Nho)JEC*X+hs zmX|lo2}Ob=uer)lp^YVeovcdhd&E zS&!qrKBPDM?``dCazs(kMw=OO)er}L09+du07;IF~JyFUWsLrRJWh z$CqbxQSC*#cq1vu9v>5p7h|?CwpdOJHG0q5Gx~#Di6Cf8&-Ttnx)MP`l zpK2GQLn5oC=>QdjCDAJO)}g;WtjI$HKA{+-AZdt-Y3KFmmd6k6(PDo@42CVJM!?%> zH^6uUdGz#`ngFSgTQQOZeT01A!kOz)V(-lG$RG5gc>)Ehwj^h7AsLV%rClgh`sk5N zxs-fhfC@nK&?DY%_$<6c?kVUJ>^zc%DdmWZUsVV3PU!xz1r8oeEGjve!L5I;Imm!Ylw-`Vh;P&nqL-Jf`F&GA_ zQEePJ;!i-o#Heb$ytD#%7fP(^pAN1ambMQXdON2F7SBK+1jy*_9cx4WWy;orzd%z( zIE|D7q{XvfhHO22mti~oc##hnmkPAn<@67#4-&d@^TK6ekZ`Ii(AB_?2Yat&6Z>aQ z)aDuez+cd5Xj;g?buX`?3}Hb7{kK?>&;cZTjL(SU$6Qc_-Up)uPclLu;CYk8s!|4c zC?c%n;VU7fS1M2(O*m8o@^7Q7jcp9PX<@+kK5%l|G_CvK8W+M13;rSCWvcL!XoQsm z9>I*&Cn#X$vZ$=Am#vByf_N5rv-wL% zst1OdOseA!rWcN1gmGakwv;g-KtOR^Xfp@PM!RCf8qLS~`CPDh;8GW_{yBF2|>k5Tmo_d@Q~uYYQp{)d0_tZi)w- zBG~`F5wq`_7lo;S%7Mwmja33j7Ia?LUMy{hp`-Ek@c~8tDS=Ew^x;fi-~NQ);Q1A> z>kdXCS)uGL=tgSuUmv&XgVkqyS*MqB0R9UyV#w_4@VRvs{m&534;cpn#z+;_$!F=T zQQ>{Ze?54g@t%&yB7v-KM`bh`o-FQ!%qg&2mx*=?tP5 z&P6DK6dz)i3N+aPvy|eBtONWo7q(M}kXu$xP0dE~-_S5=uvJyB>8Fn8bqT(%U3k?& z76aj{H)^}{)(@&6wK<2YlqcnEpgRN{+OE2<-};e~o)KvK6A|RX8&2Kv_}pE%y>P@r zp7(hltzDGkkp_0x%CCi 
zLi0jY6Y7k;T-|bsYPF^pVfGuKhT+mNI?+#~r6Cstn*o#(B>w&3&WKsxNiqLEF3b__ z>jWwp%=AoV`OV3o!-b|f;5i3}cOTUMq`^tG`)Ap&!jtw%6t+hJ#o$1_E6w%8$NLHl z)!-U{L$YaD0uPTnQg75CWV_G*b{Hfe({f2~ncY$y$*dHSsj&s3bh(g+fRqRD`Y_(K zzW!A)FT*H&<4Z}5ZI(sYWc^(@y|#SFQ#39UmG^HaeKS#0 zB(F}s)c7G&ooJ(~mQdTj-T6(4c^bqAsWF5XJ;$o)ehX3);pnK|54=ur0>iBE#oH?C#cp0j(i18(C$=99 zo7?Hh8~DFuu6&z%v=O$9>P9N{@y+zwtYa>#E3%`Rnu+wlsx!1;gk(k}%igT6cz-fG zn0D+`c&28yZ1OuxK4f`eDFqrMF^Bg`Iy#rVv`k``h4j=xh)7-LG!Mz zyV1T>;!)?_t1AzFO2Y$EFy&L({c0Jh%L%s@2JOC=c?z~VlR5eKLH#eCjc;RGWhZ5I z{54$r)u=F5HA~jaJC1gZm{eHe#_&F7J*JDSSpH(to!iTtE+4}8Y-6v>X$~TXsX4!* zK&c7+EQ`6bX$!sIPkN4}uGPV%xF0wctvb{Tgq7YNI3%e8USDS)HJ(YpbE0+L(L?&5 zQHnf(x0XHqoxKWJia9bgLOoR+Eqj;jkputmf*+;6-wYY$Rak{^(Mj`8nANwfs{1so z^81x{;mJ4{`9p+VYrA}zXl?2@-B!mkI;c^m5E!rWRL_~tZ!m`}?MLkq|kbg&lu z4b6r|I8bM2H}ZS=zgy&>_MOZBqv$NWqI&u`yg{dQcXxMpgFiw*x*MdMT^b~$yBiTn zL6BHFB_)&;mJ$#Il-_;!{R8f~b7ttM zo;!vD1H%gFeki^}6d5b56+%ur67sI-N;5b2&kY~72hA@M4IiMJJEY6u+iVC@l$0}c z3Vz@yr=;=N&tIp6u;eQ6ZVq5t|w4zfd(W{>kG14Wy65$|}vV){m_axcT zQh*ai#F7L?hXTw{5^V&&tP9-z_yo^~v)E3&p#p*zg7ZXjmpPREuMP7mFj^Nv<+$g8 z+M=LbaWIt!ejx?|s<_v+MDjHxB6oXasH+1Tn18xv^z3%*`bdT_oC>P7OLa-Oi&;0& zOem9KH$b-J6G>ONKUj@)u}zD=F3@)QVk`G`mEhGSgbL--7TA?iiF7Cvvpbp{D_zFs zBDF;Y>OA?N@?C5L8qX&&?M1z|-j7SDQAv`U)iQ;E2or z;NO!jUPkzx7=!gCgc|iYMYiewQk<8(F7MGQ{}l4uVi#)af5MY>dM|po@k-SXOcl8g zT#&uYdgvZC*5D_Z5i9b&Un4m{N?w}5>$U2qzOl3Dk9{k5I6ZDeY04lVBpSVtE&v}o z78T_Bt2e;q4S*NW0!5LB=1m<0ce-<;&tMCzm)-q z1VlOi>eI4esf8EGwbk{3S9aJ?3nk$$naI1g1}}Z79MuKPz@)Py0Fav`r@Erepi1*?YC&{a+jo;W=U#LL z=&x$MLCz^*vX3wRh@*KJwG}Nv5o9N_I8J7=tPyUPS_YChUbfGc*CXIVY-L|0i#DV* z8Bb!$ZJOR1^Uy4?l=!&YfKj|=Ouw;*7-?#DN0Yq~1gadHT4F7Hh6WMu!-R1mM!J;d zb9~vi!E^&20fukW3OSxTZu%Ym1HX(b`af&W+rcV$r<;XfQ$i|=){oQJv9C88PtHrC z7W!E$y80_s;a(YMW=Eiz(E)H~;^VzXbnb7}g!~wrH8kh%b)ro-1^`0M71GkcVm{*b zvQ@{r6Kp-NA*9BctQ}@3&?@@)f;jx_5U<$k`YVf}2}Mf&Lb&h6MaiAhZsoKAK=)7m zzbDO3*k$#2X_&4IK7h|QdA{7?jQTBj|AS5Z#h|hlu5Oer#O0Z0(2$=Xv^vYf$dW=K 
zP;qd1hiw%>_D`4A!}B`)!<7OCZ=3(LJugg@tuDX-$c_wjV?!K+cNwr#8%9=-r5Ry=tqoeem`DeTNr+!4)_p(teJ3l3tK(_) zb~IRgCmsV|4AEX|{=#2z3&)}!Hf)pl+oSx+6X1XLhm$nqKW3lX!|w~(=g`TldbpUM~Syfx(+)mM+g7mOC1vj}K$^GTa(`Berh}-#igYCobXr z14+-6i9=Wdc;U@2{n-!`!nU54oeXdrwa;cpO7G&co@94T#|IDv43xr|U7C8=a7a9F zL)On-lve}Go)ZYvQ0l^yd#E%$--2C(3e{8td5m=v3Z zD^~`Gq{rX6J{zK!Ks75n29`}{ zec|24xf4*^AWg_%_Dh`n7oVSytPe=Uco#5vzUi7~k^cxliw?j9h;)+Y*8w`@Z$UzP zwU&x0dPHEt;p1}WyBDq3psat`aQ-x@fip#@OU*BmrAkst;(fJ@8ID)S z>f^JNwFD+$%NJssh|}MC;6j--n%ckK#9D0F8qMtHXh1zvIo8JecE3Q0!ZQoDm-;Nz zbh;SKD<4j zQzaaV+Qlr%$J8oP?6XC7(k6RC+pxmzpr^)-d7KeyT`xn;tHEDJv+D}~_+bgJr$<#kWXE_1FukS`38BK zsJ>)rR6HDhmmw_cg;Bv~8i;}S$mbXx2a1LpvIl_cLT`l07p5fZc;Kbp8Qcadm=!=#*~czSHKXwQO)(YY`B zF$l{n=Htc)$Tf?AEmwB>wsT6+WjVVd`C_%A-f4UcuUuz%6ji*7I>%lVHeQuS4kvj` zC-G={BELe>hr0|#&Fyrm_iy8YpOA`a=KL13MN|(=|L49Eix>Xv|9wz>D%$no;JvOPv?pLR z^-swi+RL;BH=L+Z=GGV1<5@`^dFx7OBJ2dd;~wum9FrvTo3Ys{W9Z=>Gvxn}n!Mj+ zn9c=MCc!9QHdeXNITyJBO};+;H_2hEP?QRWPQsjqi&^?{w2E znk6ZRfyBBYzVh{5s#l*ixwc}d$|hmAJrc`fN% zJJbh#3oYpu~=cQq1f)U{7nz~{;K5XVdZ4K zldYvgn~90Ee&s9>T|^?I=%pvrG_xm^JGQc*Sn=1olth)%?D<$bL)+8)KeDz+V`hM~ z=E|$jBx2a+jC*<553vT`RD&O8QFq-jXF=>ynv84O7^LmGO{zVu(xf*rX7QTRkNcXs zT3pssYP2f}uR=QA6d@{yhhv*HmfPk?Dv=a((NBdSOlHbIoX4i&;;x3N@kp}FS2N$) zqBM+P-ft76Uf{PO*#9*mXTM2&Fmc1vci($y>_`EEUypjCkitJ5L8#xFCyrTh0<*kO^V&tc4{#eXC@Q1tYa9D zZBp8sPTndS4;ud<6yY zP+Z<|YNQKWmREI$Q23}Ey$?RlzP9M^{3(pWolW9rXA3-LPk-TtV zLU#O<@nrMcmYYh(P)=p5Z__+W!ai@B7hEo~l1*qeU0B~bVK9>TJN&LkPkh^IU9Q7v zNIThzJ@cfd=G}Yubs(?6lfuaEbH669jZjEyk8*U#I~R2t6t>@@^<&@A#j>guz$ZgIegc*V8jgl$R57!yPy@W>nV~!D{=~t{< zbY0PEY0KE!XA?eeS>JwjPB4sAW`9wsz9~A|UWLw~H!X~mdcivDRQvaqaAA1cW^iSb zBik`aUMav;aj@&b6{u)*SM~MA0*zTjc48>}}CQ zF$LBJ-@^M##ub2_e{^`ZqeCFCLNvf3pQQP1x{;0<)xitmRK=sbe@UTS5dhsto?>iQ z=k`nt2RH3cL6L)*pXsFzY81X%TLARg3)~GYaT<7&@X@AlP=+xd2ci<9#sJ(-_l{I}a`^Y2e3y~FB^NYi)0Dx?n{uedL5R$6~h`ETH~R9_;%f^pmN_=;dPkcvcRF^h(?S zR2Wut^}}lvfc0b6e|GAgTY0}UsIuEE0<(;h+IX9AnzeKpEjbVE*i@Ir7yixzih+7h 
zR)XXq0C#8AQri@aX7m|(FhB>c{WGh9+frG4skBBm_$oW&t}Fyfo#7TVqKJ5XP(LxG zD(C%4?R}3AK$M;SPLS)lt_;4)i-rB1KOGsbhp*St6Ms0~(7Zxw22)kv_?< zn!1WrqhFqv*jo~xdn3uW;e1_#xL2DDYkEUDgL%`<)rWn%S~9LVYg-Oe{ zS98s3zLpZzKJ%q?+_;=5$eivf>?oZvEeO>-`}xm_il#ph*B^ouo{N!|csQSRIS@aKQ6iRT}r}yM4^fL!(FSwnD!Z=i5fhF=+ohryd z$ji^U;54DQ6qHX(?fk-vyl+0jdoGuwghM=o9n~T1?u-q>w0Am&Z&dgj7iwpdf2uQb zr4HQxN#4xMy{m8i_AG%M*Zn}U9xW3NvTT*ZP*JB@)fW~sKBrSVICG!jEZUr8qn$)4wI;g$EPv2F0(!|X1v?V zHgqG8@f!N`OVHM%FR$%gRr|=%Odg0czXPFNZYrC3#^~ML#4n46q=4yA5llsjh|s2h z@tbF5I~vYGuYn8OJpJO1WNE()trV_I9}H>Zg7m3*se8Mw0b`$Um)-N`uhEh7$FMkN z)?I``9FUz_0W$QEz4(ee&|B;fJw`aGuvfovuCjTVvSLls`KEJx1s z3@amEblD`X-}G{B09}r3*@EeSYD78+qL0n=tJkdqKk2I%)#Q)hVYsu3-a|H)-w0F;wM8G(O3%tMMV03O;($;}_;(i<>QSwiJ@+ zS+tV*V_XMyMyb`DsTIq4D*D$R#UO^|1WOUW4p{>YfaDRYST|ytbeHl@o!*R}iGs%d zxuj<^b447WP^WaJ9pY_W{m=0TBnd|U3?hKa?WR(`Cky3cc2Y$;nAl3#-M^5m!(UCj zWmDHZX`=-_{toLe>BxMCHlmmCVY_xA66Th*HjN|!_5R={Q;GfNL2%uRf;*&kE=qcG z5idtlg^*}>I1%Xj(V|Ct;C9h`ewDy6yk`TQWzH5(g(Fg#nJ+))%DfqKc3BJR@2ke}H(L5P!NtM$7{`X&JGPWD1+O&D{X2p}``z0Z z!aLnwV^2qh13(KzZ)`_qKb~Z6bj2^_)O;$;afNttNxoQkH>JRLPaXE!ac5uk)L|cz zc#GcCa-7UI?G&jQR4@{Juzge#T>5X!TU8ifMEK_>M9Ji>s{+w6c<|;#ysUXRrZ@ zeFa_>FfJd<&BNSMwUm-?Jj572m@x8IuVqPr(C-KFH9DU*#)*Meugm57EpLiC{(D!T z+1CE!Q(Sd=8eO_SA1NbHKg8)m$l=5ipo?S`PW$s-71_vX%Rj|{dnC-gJCJ_e&#d#V z0Mw?M0Q?;I$?DqgKpANGV*xxc$ai}6Zb&g-f;6hbEhFDPBU~krRg-h|U@K_{N>V*| zZt$5uzCT04kE-84eVP9bGR?H?GP*keYZzxWh0$N4uM6J`CGNc}X;@d#wqLzYrO>r9 zD`?_9$6LuX3PnFMFT-E98NRes6v$Fwhm<|pyo};a`eeX4r^vn^5uAh6M+CWMf5>t= zvDU$FM37_15|#fRmViFsTpqaW-0TR0jznQ?q%S@^poIxw8t2uFB1tNax@}Y~J(~yC z&{~&?pC4hBPN~LKFb|O4*vQxx+i~h<0vpW*#30ki!|c$ZIr{twsT3X zn7Lu-AZ~E4?=NK<`c?d^MFdbh`tB#`$1uWr&?|HVZHht3i4dU+fcNhATL66_`ivVu zJQ_?@SxtwZcQ1t|j*{4Pc`4GYA2~Ie+JEv_s4ZDAMZEVRw0_;K$FfcY;5F$k?LigD zqZ0&_e5{6~u--(XFZ}o;wG*%dxWidLh#WhzAJW;#xC;BCx7)N$uyv5z`~}7)0}?%x zIQ@R;;|-p0{_Vfr`;hDIrr+s&Ao9A!3gu)sLR^qWyMjjv~+|?{Z6>jkrIg9&SsN%qg<4x_ysCzN8iPEt6Gd<`7c=6D>rTCUgo&h%`a zshM^O(W;-ILY0oA3>%)uX}WS{^CgGIb(6vIy!Y+sec&jtJavMk$QE3+u&!oE_~f&{ 
z-xUQ)ltF&yEU=51*OMs7Y0?y>O>=Wq01u+rh4LJPDh6+7i1vI0g$`rc^)m#B=+r4R zbmp}vNv_~yCD)dednUQ_K~WV=o`insaYctHL0?!(MM<^e4P+IUkC;mEW&NglRw!W6 zg925AJ{V0H)Kuk=rFY~yne3yfP~|VGkcr%IKk`FDBdy;5_YX05dk=a7J>f*~{}~*% zYb2rduc&Vl-gfH3M#(RHcB_F6v5(aA{t}LNhLbhni37Koy9zLw-QvZImvg1{rEo>G z5v#}2kC#XaTNSz_RrU|xLaeKpEj2^Qt}rfXP)d*%yL*QHI2O5;GVP5Grpx+Ch!SM( znc*lEKvQM^dr)Ow4@FQ&*czQ&OzFnv*=Wa=KRx3-=|33cATV|YnZ#&6InK@BPivnc`v_fvj<~tUJDfh0-vvQCg-j?>O<4ab z-#roZGS@4mdvWIQ2k zRM?U+;~SC#H4*}_L&0a0xtxX+(9s9#p7nFS=A+uB@+^qMrgCZIqRZLwaWFtOCW;1E z_;4e`SX+Yuio`&?1fV_UpFtx`bc4rPKVA1jz)@6O3PfIt%kQW?adQy2uxRoJLwhq5 zoZ!{M(wFg`&lbB)(}UH6QjP%PlU_;IuxE@26nMgf)M3tP|MY%jeg5XQwPvsp58Dv0 zWBoDi0i~7bVnrt3DR#-{AIY2!VxyR!_p2MLN5ZW2#-r%{XZWZ-Bito^?=r7^u9mYZH7iGn z=unBNoC+HfQwB!55bm)BBU_`JZe5hx>d(Zm>Kz1F4+FO1MjT>z;@h~FEWGn<%^!Y< z6rf~*U~B^~qMPl2lUT2XC~oh!Cq5WPqD8UX8d{Z4nd9Jw^Ie?(ARhoi{B9-Gml)pA zuoRsQ+?8R94(bD@0cF#?*#$q=Tt+fd7Qa565iASXnJ)fbr?}a|K+!cQ2}cXd31&)e z+l^k`-T;Ihm2O>kgPEHD{u@qgI@lH*LinVAYE*{`3h$d1IM<$fqCr6i45)AWnjNu` zBd9X}v4&rPNKu{;Y|B5IuY1AOBJ`w)$nVcj4OOPA$;v>|9cpg-Bjs+5w4gP7_Q|E` zY6x0rIBHTN@mZaz7#yjZ!sxG8h>k6W4=UrOxJd}8A?4p9tn-gAW**F{tY4nyvT3JsZ?Dn~#u++ja@VC{Nt8=SRbkE#|PEOD|ExH%EFWZi0p;|7SLg z;}5Ek@vj|Gn7AeUh5No&NCf+X%2cS^e1(6%I2S}>S^<8itlBHByL4fQpo_}=!$EQ| z4_`OS-`-DOP@q<5{^yl#6_#`fL2aPOgOPNXJpt7ss$0Bk9&Cv$)_Y&DpRgkU;!u@A z1036XUWd+Qr|?{!9#0Y|N_#c3gezrPoV4aY#;#Ex2|?+a%GH;AgtGr?TebEtJ53I} z1R&A=c~>z+)!$0_0Jdr_E`w_R#u>Yl)b)RC?BySMrf5FOCk~@|F6vK|EK5fiz$`#_ z`q%&_OgqNUf%=P&peO7;^lh+B;v(S^%F*%djs8vG@gJE>nM+^R=fa(sn)I2niunt) zjK!yU^zM(}-|UIcS01HZE!SR}oj(8_$z+D zRrxeP)i5aY8z|XZllTm!Dd#_mv{`Au_Pc&F@DKaf4db8sdHHQxM&I z2tXGA+*&j0&9tb67cW78o*<$(*t~*7>x{T1_?v(^?;mkx}~gqT~zXU1X0H#OkfWPZ@z1viy@h_+60HIwa1z?eK9yzQfVYPXad#6pd9b z$KA5g$F-Lj>u@yUx&3i49b$FW5)D8Sx&_F8yB1(kU=kd|jbcxzx?{;JjJ2cx%#ijc zs5I0z>o{;cn`h#OtjLwo&vQkQl6C9c)qblkEi{R9CD=x3T_NE|f1mW)6%-9X5wc$l zJSKa?x@FqEXhVk3yNan2-%kR2MN!@-hxAOy;!X=aXGC^|S*2Q1wdRO%EZLBS$~!hW zM9o~E_R;(0t!zSX7}`y`!)G2cZUmN~PR3B9u5z30VI=#3OJ2Tr-O|w!T!QW%t~mX| 
zA0{iPcJl@mLI=yd(cZ7~p<=B%3-5`uc-(jz^ z7(NJY1Wy~p=Bs@D^R}>rU=me}GVmb7g8>^;O7vSjQb!p0J*S40yW~LD-iOj4U7(>o z{(=z9W4*tZ#WGWoaW4A;XSg~DP@XUA^( z>NhXRfI2;Vb)&@N?YCi3j4K7)28;m(x_VF)Bx2IT$otPAD&V&Y#H{xZlyI<@1E~_k zn(C|BS1LEiX}FT-ial*Zq$8k}>vp3<@BgJRDirZKAOLzQa50Z$mto!vBN=E?4hNZ& zgq%9`Fw`?2+3;ToCKd=+FcrIOt5G#P`~m)w+!;mK(J!z z0mkIwaG|46la4?2HNA?VOtk1}R5+gd`dA=ySEs|v8$Khjw$r!%EHa6yun^-HRQ-gv zY=BUU!d<(adiQ6iYzJwL&=`iCs`Vry45PSJ(s(^TN|0~$(M6Xa2vyM{9&~nNiLQL) z3Z^P5B76`N_K{f|@S|6V;-`PO#|ZlSwuFM>KOB^LkgCbj~As=|Ex>$%1iu z+1Ba(zbloLi(5oSdf^jf7*eg!6aI{H0AcrTo<9HV=J|`De{S1kUFFj+*}93aL7+rH ze@+HV(J~plx_c9!@S}6-7j=*rNEYDn=Hua)JBFkyySS(ASY-4i^5WQM)Y)ktdD{)O0GY4@mw*uug-jQ zE_+%&0{H5ZN*Y^+&FM4`LB!YUO?)_nfHCN8p1NO zkL70PD`b*va~#R9RuO#ku=HQFF1&uf!%g&=Y+z8V#6_6tb2!7j&wn`7Syla6ZqV*O zEz@6VC{Xqi+~0?g+{c|P(k(Won((7X%+d0MO)x4b*CP5`q^GZyVT8+U`}S94!RXpa z?Cl-mc4-GXy=I@s=~Q@R|5=eb2Tudn3`F0mY2g&}UCiS*xto4Y#|*oi5e!V;$UAY) z?I=U5PRw|@qlD*^qW0e0-YkFrj37Y$_4kki@bCGyVOI|CO{#t?d4(RQNc;l3O!ty{ zZyC?kc01W>3UCq>82zcFZ^lsLq_#NLqm=u@@RI0tHlG1niM5k}L|`^e}lvfEgrwhlEr_Q_Pmi zsfBSFmoMPQ=2Ik(wM>h{KMgJ#0dp|b=qegirstAJ2 zoOTX1bLK9u>2B!(DD#Eqa_aq)gYdr2qO(+|Jcu010V~?63+D3*@~+Vh%EqS$uRm7db3(jS7N-Pax$%4uNO=m)9tsVh14PYBw1y|QdEL{)}jQcVJm7fHJ1vdGB zcAz)x2m`<=3aVfdWKyehn7%aK-2i|S2C9G`P;LaVuDj1z3!kK>GR0}a* zMN8=u23yI?DB|BqyNb}iF6kH>Zngo^14M2r=0d)Wwq5SAKQN%AIV>nyNfp)ndK5Y} zjQveS%F`_9Gy)+iw?J^>1GK@o5^?YUgCq)fY&!cPa}=G}EOU>fHN!k)MFtEl)6109 zi@b(_ZK*>*vfx+i%%rVz_qi?C_?jp@p9_Vout5!$6J?Lw;?;e%Z!n%Cu|5EJPy!lT zvhH$kf91Zr*L-&*fPGYGyU-=n+cOt@XjvSL7h!Ur z>=lS^hK52^zyjZEsD+U~sTodrCDVS325g3-46*b;k*rwx15WphGNN*J`~_UwQvHDV z$itMg5`*Z{pO!D!AQd{*yc7}AkI1bbYOSUGkoTvpgvdzpW1KT7xgeJ<*M;}=#uuaJS ziw><&x|gGwLYnUqT^60z7b*uFi5{;1-M}PJA3##Jg8D$#OMuT!{a+3G9MStwY}wi_Gv-+}#cMoQ zagDus;#{PVm8ndS!vwivVgwpWbMk+Ph>C6j!yvr?I4ODIJzB4I9}51kgz3*6Fn;?^ z=Z_kN2SvG1F9QJ}zh{fC zj^HTbVP5C7DWJTBKz>iDoPVKOLP9?@h=ZYTp0ri}#?0lCfk-X@kWFVNpmPBhVswW+ zk{&K7p#6&cWvs|CNXUxiOYR?&mf}2*gZE~_f*y!Ma&F{b3wwIM60Y*v&NzyE4M)<1 
zRgf$7kQ&F%cI53WULlD?rk+Y`7dFeyK?{Lys|Jb>sDr1&!+1l*UPnW&Z>PQgCy`{? zaP#1EbQ)a;%Nr~Q6H*x{-Uc;ev0@@ulGc~cr27vyg$QHy5H>&%T_Jqmpi(5Wwzcxe01WvCQfov8P`e)g0xx99&K8 zgBn$4j~4=z#dwueMul%?)OTWpF*e3=`1mDJ;Bn;8sL}0Tp#dKSS}8CT*yo5-GcGeq zOEEE->0d_4b=;DPFwNPf3KOJqdV%12BY>4mDc^@-y%ZUQ7-zHdEF$`g-on96txyWI zLN5avJU?Z2In*mVJn zn?wH*0_vkRV7R|`MEzm>SdDEg%uaI0x`C5f94)Sq&d0ooIj9t|5?|LaB?uOZ1MGni zP9eGR3)0?r>9Dae93pY+UUV9uV)g{pi090+c^6KBC#WpC+mc6{%q~#=oFqH0>@!M@ zf+8(Rsx&w|dU?+B`wpMp1x+OuF#-lLUN!LXx!a}5kzqGq^!!kyh?2Va!!=ZWVq-)l z5~Q#X{}uyj3n@pv)`V(XaPG4Et1jZ873g<|(gc_IC%%t}Cx32a>Dj8?9o#^=pm@aS z_gTaso>YvmxQhO)Ctks)NTp}DB|wdz>rLG5pFm0l)3NYk7n0PJ9Iq#b{A&;Xk|#nu zn5e1DFN>w4lI7cgC4Za<@IM`ZAO;nhP0D9DEogUBi)hw-E9NhD!8ulXpDMbIgV^;=Vu66RZZ#Q!zCW=Gh9W>Bc z!DCi_B8PVJvD^<&e+c3NC@1beOM0n}hex?^<|`ca=ZOl%?n&rG+$*v}3;8<_hBH6_ zcBoTwWWAGXhX_C%ZC1}$vVvxB5gE5qO~ve7`X!sF29|o9NsSUhlumVE!h_0(8Hfz5 z)ZgUaqLW@J=wrgAn)5$Zu$8bJDnIc9pEU1nDiP8|1ukGGRBd%7VbHX@qaDCFDvvD! z@9vx_v-PX{S^U+`ha-}4eG~2vDpOGAfT=d7iK3)N(DeuFkH8e(K@XAO+*ySR#BWaI z8d?|!JcPiE9vvShe?fBzcZuKL=O)BjbM*652HzD-hf5PftQVshD98`vGUQvw6H20= zH0H2m(V`rGtyQc&>^j`H$5m^jpP`pLUX~24@doo|d9N0g(3~tJbM|U;u!SFGDe(P@ z%ts|q=Ug_Xr#1Yaz5ps@Bl6+|n5r3HVupt~l{}uPpi;C3Swqo~znEiz#ZSBchfdfo zeRf^g2bq!WGLJ4W#y!~aJJhrx3KeBiLY?)fo zP^Yc)?a?jln9pa=d@<5|?|;HrFF|*hml$TwlN7i&CQi_xX**4gbOcT-0ZO1vp<^H> z-=%T$@g=T^y_wSBOxa-0v(Hr72A7Ip${LA+s08?m^bSS!n46JQ7a9_Qmjd-EXN;q? 
zADW0N%8K2W!@1xn!Y1?kE%tQkiKQBW)gdo7y*rx=2nG0{^;LmsQfAQ*ZEwpg7o{s8 zr8f%fQ9s%`c;?>fPVkHsyeIW9?a55<#m*EYPC*F250tm|ElDyKyUG5`$ zWIj~I>JM^cf?^zX%;KDae+~?pkrcQ@dQC^w@Z&nIbrE4%V5k;RjX`F;@B`A|fG$;a+*{Yw`)Q&c}(ZnD54<^D^dh7>84o531J zb?Y_a&FG{QN^W=i12TEu+ev!qIuQ_4Cz@GPbEjP+iqWH!X&`YI6x}wr`ZET@nsl%F zPQO}l zO1fsU-134)A1A+=vT@?E1jSM>We-@%^t@;*-Uq=@+dtMC-avv#BK~6rFu4{|biORE zO3_)%{%T&Uh(p2Sru0-k3@$5U=#aQw1joX?x51z{0 z49CNY3^^`j6it5M@@W)F)fK5nBzInjGh*TwW*%mUao>$T9%b^|{4m%ZV^(FhL!&XH z9CNA1G}mQ3O9LykdKFB?B9?*FVnFgn0KZ^~VA%P4cC?GP0H0@IED4a6N_wMgHn7K9 zHgutL=YieyZ=xb-i&2IjM2o?6aAg58y zQF~~mnjOG7nVveT0n=`EpD7z2XrW+$DpdXx5@_$=4m$Zs+Ce}YT1!BEL@UoXGgYfM%vxPkP0roptLWSF;7-g~fa&yU#jF`8W8(4^b zWf6D~4HNzk=8d4UpF|^xg@X;$mx#$XRoP^_S zuJ3hn{&A{G&;5D3zZ9B_SNPw|k^ZD}uwwnRD)-(p_a53`)U{Fv9g=`4vE{I!cYab_ ztd3|tSvoY>l_Xws0{-Zpqp&e-d0D}{&sm>_$V;X0!a90jh8!joAZDqaP2R$5Sbd0p zzf!BJED#SVD$HV3^uGEAw_gBY$|i5*yYV{=BR^6>(X|F#55ZOFwqgP4gAr&&pQOZ3 zYWoyd*gEe+CNyaVLh8azVOm|URH(>_m>ot zcRE)_x%eVaf()Z(g5!z|zN+;vgLe$OGec~-s&{>X86x4)RJ{s$-W|Jg2jb41ibk}~ z(z{b=xIQKdKGa&$2DZe*ex-!Z^Y_A zUyCtc9elr*?#U&IWLcz15?ud~#HlXLPo&1nMiZHtWJ`J9p}L`tr2I7TKB$rMjR>G} zb9P-2EUN_k@X)5gzdB{w&mY*y*gb~VDnNyg%xC9N|7S; zxRE?1{r06s&cWK-8mM2kk{b=8ETNMy&F7kqv7yK=Pf1j!oD7)mvpM$Qw^%$jmQ$?% zFgXEN828_EhJE=l7^%7UQ!@{I(ffU+Zd%^I|Hz`+SX49U{2()RD`X+_#3xj^1ZVm_ z3^ciysCIGVxsFalP(OmIDW@n4kNyeR;mB4i&^}VvUO}AZ8E%^y;AEsQ4`EJ+ON1De zB?wf;RU*Al_L?|$mQ&o{R?ojk*7u#hV?id@B>)Bn-F40)weA@4hm53sO-{J+qIF4D z#CDvTFE0z7W156@B|lDDt+g$9GBhii>sX9rnGiAj{`{R~`C;$xd-;`PrFp@fEaP>% z|AUF3!4dO)VEvPMM5ef06H{-=sYF zyDv=cMdC|f_iLqzvX($_b7?MKNIXTfu{K5}6RX*LCi|5=qIIfo6&h9CXZX|G{Hl4c zzKzaf!m}(mZTQMg>h zOsz9D1)AFwV0Kp$sg_-_LNouv zHt}-%-%5;PM`GpGAUBsYZgilLvbR?}5j1%U8s$$7$`s6AoebvkBf96Tld!!Y^pWp& zS^3c2#<-d@w6L>zJ1bD96sP=?tLg0}k086>i~BlG_OG!(TGqpkw^n5pa~+)*ZuJVUG=x z2*I`PBqyXYE+$5=Nu1=_;6 z?TtWh?clgwp(D}rF+d+)O$l(u;@7Qa*1P+pa@&<>rn^V4L#fAQ&>=d~7#A z$+^Co{{mwgJBm3bmJ0BRudr{>WpA$%!_Z%P*FlA`!E*Gz&^GBFr?x`+<~jA{fsI~y 
zZwqkfvvCP;K|Rqxt(4*fTnGu-#!+)1=OSkEK9g14;m2VOrd@{DhYL8|{F>8!(=dLKT#F+xe{?hZ-m7y{BI-5t^hNW%uwAl)D( z9fFjUFpvfrA=2Ff5`xs&-hF@XwZHbqIXma*qix#gxB{PW4C&SWrw#S z;?ks!9o)oOLTY(5B7#xEos0|151g3|gsHo;)lO3Gp03=Kj-7*%n%CT2NMag~qGqf= zTb6r}Kfa6YT6cxerZx*n{kh zBvMzhOh3Y|Qesk<;&dDe@K$UF<}y-2Wof~fldyqsaO{)!p1tp1CmtDUDa=gB>_7{| zrb`jZvzjtJX`MXpI4<#s*Q4T~rAbzjb9Hu&TNxS0y5rB%ltAwr;`Lp9S-RbA1K=YW znVb;<5pMd>&vYGMf}=g4l0S`#zu>#s0c!sYYGIo_e>p6xsbA)7oL$jd@sloBw$J&B z7K&AN(<#y0DJV-GuqN3*6!EDDGc>^?8xeWNa!vmj<3i8;QqkMo34C-B1TvZbPICS* zti9+Hq_;$Q#3O z5=P}8>VyaS51}S1M`uJwHoM&?&-}@F><6tMtcj}&4nitBsZ!}M%SlkNAKJt4eLqKp zE4bwfQIZhmWlp2Aqxep~->hIrCGX!Xp$SWR**wl1`CTG|##Ko3esk^WM8pq1ZpeP0 zE0ZnpZBpjF-lk4L&Ysfcq|!<9yvlqmI@a|s<{Uozk_KsJt?P;mvgUW|%+7lE$NKQN znC5&SqA6}!?r9d!zmIP}l{I27HaD|Zji`GS&2iu64TcdebY-tUY}I@78(3aO78Jeg zr;e32Wcz@}Y<8?DyPTQ~tmaPjFdOraAtCd3IOHC}aJ~ zR34RIML3EK#9PRp%KYnpC`^m~(znoV|Fn;b7)zT#p0320eT^>eD%x)8p=Yxeihydq}%sAoME52lVqtw9PAk5Nj(zKSTBl*e@c77l?Y?`oJ;-d5RA-^@q7&DsCI9sh8ksS&`WCpZ744x#}V$F0%I_9@3N$~J1+iaLW|_oVwvNeP#Cpu zTw|d0j17fa4v6O?;lgfS>T{vvnf*t$19^e1FlOf$^1pA*BPTB1!>G9=N>KT8kx!v#DZmM~xu9>i6ehy$(d(sh6o6~<4U?u* z;rFJb^91GbgM!Xjy!f_;?1lEp%GCSrT^kZ!g75Ie8k~ao@6(3&9|=uG3ZEt!67XKK z7)#;Tqfr%pi9p}o5xxsi0TXF~nT;%N8Jdw=@Z1hlsYn!ji z9z#>%CxQ&RQJq}h4kYmhy)oAsIkq0l1Kkm!?&}Q+2SXLWP`FaIH`N0JB&bCGk4kKh z&~rwgRs59To{}+wpBlcKk>4Ap_Ac#L!ZeJrLl=dbs^)H&VNW-#GxB~p>t$`L=j;^f zxPzpG3+q`;^6n0SX>ub2cq)C=f{A6!ra`i=(l31yWj`thY26cmT8pQryLI$^C)IR8 z_aR8=l)-L!AQ~5?H>E_@oKfSgWqDke0pZfA}e9$q(M0U5O%?C%(e=951kXOgb~Pp9N5CnY4Y4fq5AEwgHgH? 
z!q`MhU4v6T>zddXrHIoEq|A={E-B5qh5JVgT?K1YW&j=`K1{M{;#iFIqD!M9C-d+f z{s|~nEU7`I_`J}V!Yw0V9POGuj=M}ClQ{4j7a6wAX26S8pOK9m`Z5mxatPsB@q8U( zJ$RRZT|d1cLa4$=qR67)`n{_cxgW#mO!Be-_a?8Q8SL=~6+Z^gI&3@zmvhfH%Rj9u z{d2UI9Y*aL$c}5A_D@d)@HEy`>b~V!q37)f;k@`&%wG)yU%`8Ou&UuIu_hdMfHhV& zOTtU0WR2MF$E+KBLbBUuix^^(_6GW2Yt9JTgJidpwf=Tv`bRt`T-0#v=KFq#T;Gy7 ze!e-1pQ{xpo9UL*M&`kHm|@y2rS)ax?zfXsYh`VE<}4r#t9R)|p9l{u`e!aWwP)5W z=W#P72&mEACFzC*94_~mco*cfPyo6nm`A=-ndw1AHfhgBqQ~Gsp`~);w{XpIQGr-~ zvXUPU+B!9gPB-=TpII2Ls8wv+35>L%^qaKa+XrYu1)n?8xQC(1TmG}^tkB*FV3lNA zNDD61^y==27F{i<$HGJMQoZ`OG}$kOaP8>w1=yFTMG=+W7DYf-N)J-iwNV?vWt!M13#(7^9Cr$ntk| z@zt3|Q0sS?if982uK2pfEuksvnl%zGDtS7lQ+(O7(+!MJjgZ`wwbJQ2-mMY@tiZ&v zn}2sNYge|?clS&Km0`pJ$FkIVQ2Ax2Wdo=O)+U_i1N+_w*UR6e4bd zmQM3jQ#kZRXzu^RJa|0Cu)AQy==yc~0l;H+VXZ6f&o*nHmF9j~%~+}YpO1?;?bXjg zKp)|-Lup{Vd_Yd`9DmbGZOSWNbntrPfNptASyK({m7w+9^yfHz~Yq!=bZ0NQlo&7tkU zzwRA9mL9f_ALe?w!EStd7iLO#_XRH31at?)vfua3)PHq!*J0wBW`&*s3SggSATM#; zbM9%?X_k9p3G)taA+CrsKnRS|W&&%zLi=5$Y3wiG*oT~IL@1V6OlT8K+-53b62k+PjubeM`jo0g~M&Vd%%d*2?C>qUA zR&plBPQ{5Mg1RX<(--A+RDaj#6y9U@pnVKDW}n9bi@dWHU)Iz2Sm3WF z^adcX&^s0C$hX>^RJPOd=7*DM+y0p6w17e1m z=QA*uSvdA}iu(L+0mgg|yd^r<_VV+g`#fQLn@?xJUy2qk5TTU|hIhrjY)e)7X=xt^(V0+Mu=ZmJp8HX>Fn1k< zRQRW|o37-I7ws(oK^WRv5dQ%wgq(?=es(2|fzIit{Io7ebx;XCWosfghNJN+fx+T` zf{#0eZu6yx0+R@>&Dy^D#+mpqYdG_5(EY6NuxU^~d3zmH7u@ zhYD5FHH~|I2%7m}R5zxPB5xp(KW=`vaFQlTc@>1?&I`TTh5C9JEcyaZK-TCFID=Kq zep{E7On0^?_Bs0M-wweG)VNyIzti2+O3N6T)RwdMBL} z)(7YVpn$!Zf38IDTt;iINbr&)2sY{4&YF}UqtsYpGNZ3;+3)pjiV2fKPzp%P(@s&g z(0kk}m>62QfOqbaiz)@{_cN^TJxG(^ zFm_l^Sm@}R%`OrcsB`>D>+Aasu}Z~X)_=)+XM!bfI;9avChU0u3aX1@<9-$>3XSo<$58GrPtp<9$wEWd4X*3>Ee`Zi{P9aKgoCc>P* zr+s!|n24ipWpB1+&~s9(OnVn!9Bj)5Jc0*;a&}^!o~Po`5V*9F%WF zHL9Q!^FOv%4{l)Vs&^@fI#uC&q_L)z1ck&+3z`|J-G)+AT3*^0Tpl0xam^Ip*&Cf4 zcx2y-LvLV=nW&jd{d{G6St(CEus-BJqQ*y4;UlLBU&z_*%^d|o?dZEfVEV%H`^LeC zumIA>us%-*@~h7df8;=n)qS7tyq=C&MBQlO$KhGa9ldkK0_OGTq*$FX&uKt_jL;O= zSKBKw>Yu>{hO!nhE3Z+&MZUGs&oZrjoFozwH^vpH(KCK72i0z@&x=BPm!Ba(2nGZw 
zO(PSrZ5O?ur0(?j+qh{X+Rm~`WTQw5OJE+3OqvcjszMe%{C&A&yoYu@u-SE|ya@vX zAi$OHSBjMj(TIb#D~r&ID@kPWni53W?1jIS!{yoYiUXT+fEy}9l$KHO+hg=>2j%M-n|WB|>4dP~X+8E_a0{fpBZT3^5V6uVX(}Z1&M+?cH}~)MFOURh-FjW`6E=)+ zn{F~~wy=At-@CO1Xdo>~gZ)(z(wiBUl@bW#zZ(Xv5#sX?1AiHAean%*gWbvw$B<|D zw{L|*XMZNl1D9s@w=zX4i3)*T1dt@+;Z9bCLub$Rsppr)cN>8}V!MB!NN)o(kY)oT zK5`yie{fDL<`9q2@Rqhi;6~i~L60Q|3zYtge~YYM@ajQaZ&hNu6BOI4M8@efz`WS% z-U7oM&~et%v++rT8i^4~Emk}XlU-iNv&uC93FmtmOD}QXX_rh(--Aec;L>%4;=xmw z$HS*r5G;iff)io-Uu;ifSKoiMy;-?{$|zbG9c z(iQtGzl^;Dzg=Ssi~_dXCnlIQS8V)Lm|@udv1^h5C)Tz*RwZEsFqT0_C@#vc1#4vf%?Yufi~H>|2)`uhw7M(N-G zyQjN3`lES4IZFAi52GEL`m~t6gqeHN!~Y)5$83y22J;U=HfdiM8TEZEJ`K}fW=y}~ z$9SIoxsa%H=n6}@cYwMHh*>|cC6w<^^MxQ3TaF-hLm4H`WT&;fr86$#^ULKO>!T-1 zF?@mjjBldC*QMYC#wT!0G5uy-K=O4tY8H3MENo7XV0v>e7Z1kvOb{vQc-DG#cq{f| zH0`f{$*WG5e(hzGGZ(MHU^#F)UYS#_&WReGt{>`y#RkBs!-N2q)|%(QdK6&&dh74O zZbE{b{*#BHi`dtR^p88u;f2O*)~S%>=p`-d*9VTge|y+UIKtW=zMgM1Gy+@De=(_` z_Ws2mDVGC4D03KH(oU*6o)l1QN+ZZihLdLmHzJ&lg(2VM6`td<2osvmFe&P)U+LR6%ac6!Ozy2bxO zBE{RME+5hF2@~WlXn%=AUDi7!@Lss2{0s6w{H=YL2iS*Vbzd9ZeueLUAx4iM#_4;p zH-LJ>KnM)w0}#4R;1JZ=qWvT!#CY$Cs6QY0_G;UC7~>}pZg%BW3eLnkb4P_tZZi;0 zq^Kyt{G;Cfvx}OHwh;N`y`tjSG32Vpk+^a)J?@1n%WefN9Q#9YI1pRf7g4 znIC8FJm5LSI6e$Q0x8Q~IFIKAw=I1tee}%ZJ){XYJH_ojHy$VB0T+EN*!AY_TaSlC zruulh3QG^4zb3Qs4mm;Ihabl^eUjbK>f?spG<-cU&ehXK#URW@MYp!%&PuHJAa5RT z&UJ`{!#0AC543vrk!J^x?U$q9?yOl+-99pp(pL*tD6erHHUf6x$^!}n{Q&$N!U9Hz z)YZ`E%MuLg8)DTZVMV~MxcMzW(xvIs$JA_56tVc< z#a5AQeZn!-NU!^eSgjjjgknh-8u{(93B?0c28XK{YEaS)_eUDIfe3JcTG2ZVsf zzh@{#7YZ$vXEJ>C0}!`s?1h6HQ~X)R#3xs0ykNj3hNZ%hpTpBLhD*>9)!Yr-Yt9oj zL15Fc(GD(Q6rUjKvxtW829Tk=DmFL_a-Xcf>I>~0*RdK!H6bwNt}wCCda9GVn>`3u zL2)7i5{p!x{o-GTI6z*$zJiD&4}(W99=rEcj>O3)aqs#J0WsnNBb-@ChfLuj&Pj8gPdRD!d!m&FPp!!F^>vQ3cXDvx+G+Ny=sEwn?~5KERoLp(8lt za0#(`p;9w>yG^ElM_b?dZ^<)2;PC@H_!Iqm{U7vw#$<|=vP1CwP=0vd*f}4c zi|G^*WD_H~EqC0510(hpnvM4byJCpI=tw6GjZCH_nSQrkbycMB3d1kV{&6#<+S1_! 
z%rai~9Pw#{8U8z_Z`KY)Z_|P?mPF=a*SB<1@+Ng|uTuq;Nfu8IOOdcU?3x5`#OFDr z1p=w;=Y}K;L{p&uMz8;5d#gVxB7e+<^lN_00pd=;qWGYp-dj}Z?2=<5QO;UB|A)yBz&WDN;ydt<$WKJ9MUpQK~+7a2gEqd*IoxGvTKM{!mj`F;QWyn{=lm3MG$}TUUk3R z9!l)~*xe`BHV@bRPOc2$@0Ek1<~e&Y@j?vGF+ebJ-{{U1V6@1lx5riwB2=_X+_by| zgAbdNE(EA`_s1zlSXWx<2uNW(P1-bONdP)*uP<{)u58y@HctaF`TkD&8?J&EY75B2 z7rBm#LBApl#ndMOLI-o9of*g{xtK<1szLZ1ie19lFh#6iv?n{dWYDv#M&IjUcro%A4^C##7A(jP+g?PSZ)K=H}r;Z|P8onv6B2bsy?XTH=at|s+ z3GI0$SO)kn+kgPx{-h{-8{EJr^@O5bo&226IyYWhC`>d|Vw*F^wuYWf;}s+f-O_9$ zzT5y%L{x>MT-xJ{^n*@{Kg}G7k( z!e8zchdS6o1Oe@KnRyR(eZuA9%ZbJlYKcBeNHGu&#iFs%#SB0%|E#ir^d2}HavK5@YWJ5(;$$T*Vrvc+ zzQ6yupqUuAP4=b%u^(~De%3S_y_tcjAfcuwJ(yO&I)letfFIT*DO5~3d*XlGOAj`~ z%=-ci9(f!SySdo+yE6@cv}RFBQZoFne=-OH0f!*-?=tx4i8tc&$YV|iBluH6m7rUh z=J|y~2uS7=#*18xw_(c!6c#sLWe|cu0vViNR8dM-o7szj*#PP9S1OrB#;~qHZ|(&+ zDGZ$AP9E?JLXQ?gV9VGb*-A*ozQ;+-t68F>%os&;5mZ$(GCN<0I3X+Sw3!*m9Q2ty zo#AzWw~D~Dfnc;2FLqrZq6$XhVKceo=>&KXzkuMI;8!o8oU0=oPIF?WYF%*uYg=&5e672&>?qFog>n|CY|Bq zM~YlT$~sgvyn==7zF}OX`)A98R$kx+KCMp#Wq7O4y|})adw$u1w$FqEvg6hzY1j7e zkYiexbLhb?j@H*^y*X0a%nkhW%rPVv`2>yTphY2GqhQi?NzZV!uw3Qnf4ZMTMR8*L z585XZI&V2&@H%4iW56RI0D;{S!$FVMDab-wr~Y<7-#I4Q&2oV_*;frL5NS#IcImMTbAmEvX!}M!4l$TDvoPwQS)+PZ((O4hGEp?#ssmJUai&+M2 zvWUmodvwVBoD|*tKfZ?RsNN|8n<}Ofsx85i0Tj&u1XQc+l#zzJ!PPTY-8T>gPtk~{ zCAb!2bgvd}zlLNW5cu^X$dp5%IsBV>kEkX}0?h&e!Z8@;fHfIdIrxVPm4iAlN7fit z7^r8&Z>HZDvjc7xy-cjYI5JpBHKfkWyJoV@%%6%~-!Db9A0P8BH&QWR(LLJ5YFRV) z57>c|e2bWFR33zB_%(eGtVby7Kv2DIxW1f641Y~VG%dQ@A05;O^68r2pUmcE`+BX} zBI?soXkKgR!B0y1`;WvL3`aDR^!N_@=V}9?PSUSks--6gcMKxv!C+rj1c=H$veY)~ z&v#Lk^x%lx75RhyMm6Gc1y++Pbt~aK>%~$j^a){xV-}^|%kzxN76tb>iH=BnnJ`1Y zG2M%rsN*ksQO+TX#!uOXoN-OC{$sOL*!uI^Xd*GIdQQuq47#E@E)4VyH-PChnZ=*6 zUM_AIJ5qYpy}1#@G?A3a5^wf<@Q3d=pJ}PU65sg3ItOVbS(F1=ogzp$tY&)IM^GM80s=Tu#nbcBhOiDi?Ig zdY*Yy*dDfS+DBU(})*G&OT!%FHwp-aQzqo=GepNSwbi0Wb~GxdY}&a zR9U-LB)9v;4DJr2j#63QVOSvdh2FD zhqbihHe60Sd=uuQ#DQDd?uYjd;+7)|PL)>YWOhTACpyRT2Fa9#E6pACE%OQg;kZ8aZn)s@+@M3MjVA+yr 
zVrrRN-FjGk9Ola~PBD>lZ9)xoY^LrO;!J(hbO!|jWPnA24@F}QI85mcqq_q;>4?cW zXd@{=0Vp33>>YvI18+9ULypL76>HRmYx8vZw2kM@g0W>bkzj}GT9(29u|Nisy_yTs ziawUhKElaqV>=Pcw>W_*SX*K1QzEM2^`v3h+ajtv2If4hdRofgY?SBChP9d(&~rE zZL2EE{vSW4@JXM7J(=*0O_dc+aVqw1O~;wo0ZTc{RQwaVT7)L)m6~(7|y7)m{9i^|_t400K zgK9t6oKcVwkwqzkfB1Fzn66^v-w#2Dw6R2Z&*so46t~+5LDAuagLlZsKL7C<_$$%* zA(WA{@ad1ck+*NrKZ7hl4?QK@H1+hd6=(Bh!%mZoSaeZna}DU2&yj%|RcR_6I=Xn2 zPn^_d7kj0qz3ZRcqlA*uN2RSAQV2iEEhD}E5a93kAv|_nL-XPKv9wdN3fjTtg??BW zW~UsmuTW2XGk#Yz02!3l8NWnjVz=%orkrlDxg~4_y6EF*_R82FeRoCGtQuK5n%{`V zYLj2pAp}}}>1OFC*gO-0tEm^jjOit$rw~K-M{7+fch52qZHk@r&br<2zERGRyxa9L zRfb^;aqgSDD5T6&e9plD2WBb@~mmwMSQ39FBSDVDZj+c?73?JGR-H0UO-BPW&Cj%3X)^}?z-q|5$A5d4Ni!w;Y`gFHgTI>f%FCfaDcQZZ$k<+|&iBim>fq5SO3*V|3`1Z(jI0S$CXPhG2kEqsmIKQgr4KHYSp_fcg^0ACM^cf!8A^xFkEP#LU81G!iS&dw9Zbjhjls~d?n$zBi zjllJhRfZ7yN4JxYO&&N4#(7T)9g2@z*XkTE`^K*gA%8-q1Y7~6gT)Gvf^yfiXtDPV z=qoWUWa5aZCxK)yN#>A^ObsYJ=tIbI=P_~Ce2EstUG55&r)`aNV<@Ox2qrh^D!%em zq1-=&Stfqk&0chndNr1c7Zfvi$n)$3zacOUEI{Mlc@u zeO7~r0d}W!3n*-ta+~1dvJY{Vp%T9ToxO0@y_u4_PyM& ztnuzS$K5`*5({{T1H9u-SNv>Y0;4$Tl9KrQ{vwO+D4dNJ?zZaTE5(BP^PQ(*O<08F z{gG#4VZ+jndsdPolpjJ8*T!>vCLC59H(fVSFdaLie3MxV-N(M_}*L+B6Kq>BN z2adK4h{@|>(#R6um_OWx%_j4reE5ZQLc(`2g-rgME*7@9x0%yRhWDk>r}1{GY^6%_ zJild*<}f~P2bmSiD-h~JeVIqxp&=U@A^<$TP-P671S_E7?2NPvhCvR2d)y%HeT7eA z!^5MCGzTej!Sd)R~Wx-GN`27u4p;l+)}Ax zNoDz|GCha_m4s&1T>C_)s#xv}Q9SYaR3#CJT&*`_hP*=0Yn+JJLbL!aQTZ9^GE^(? 
zC|}kAu!V@Ajt7EDs+`>5cWP1x>3%VB-XHF_qbhJ1iE7K7^AKZqdi-fOP(m2-X!l0HyW&{JuhF!d=@i0T=89)#8o7sLDMIGj^ywk0jl!VEtcfYKzw`LDAu+< zw!G++XZ35(hDdt~P5N;uqSX(eEZkuGNGI)SUb06Mwi$!6u{xJyShNV2{%$yvy^1@L z76WJ&j7GlIJ~uvGL=J7|ZNBBo>3$_NOOev>s%$T&H-7O`q4rOSulK@C99oyFb(KHr zO~IsI{OJ==CA(E zWj{;xaT6e!C3-#qFVE!vxt))wA#nL3z*fsd&5BaAb3Bi ze9xa~gRfZCu?I}8LmEBSD@I$HzN*BVKU7HpW<6PxGQWo9M7di{p+sMOCI#a@y4{M0 z4J=r2W|Cv4ks<~E_22v$kN&YKLM~ZHmY}*GP|}xS_jvk6mPX_qtNr#y$C1~^_s)EV z=FeSAE^Ou|-^V!?ot4aVDRedb{eHO~kwk+Zitq0+Gmjgrg-X|&)>~jh5)8I9j$2^a5K?J^r1#IeyvBx>U9@g(g+lo8eu5@;ACKiNUe0dXX?)FT6(_wTcyc=h zz^8dP;waGqY>U=0Bf+Iog`;IuK?5(Mde0qQOv0tK@=h9FGosz?@r)a~fDdov&3%?Q zmDWy=g}q4-haQIo_fz75CtobbI{_;M5a|0;_D zT?q#GbmJZItw~b&zJ?Oht?ApVuWUDVxeGUnLZk_`_|i1ht{V6zhWPS-x}bb@E_tn$ z7Ab3q9U=7Nwf$|cT5AM{0~Z0?an#+Z76}ze&k{L!X&VM~bt~R}#zsj#=8jUv)AX@4 z=+e?=4nhH+l~)7bI3DHCc(rfbX2#R>6FJKQK1kT~@(U>pA7svU-07OWD*+HRPb1cp zy58u;$Frz$6Ot{;l+e8TX&~T8kB0Tk>_lQR^T^)*0IDXs>P2i-iimao-M<8(6~`kl7azz^%as}H|$!|j7Do?bk?=83%&=zV3ns>SfM zj)by;FWao;>pP<86FDyi!reSW?vnW@f}$8k_-sP-1NlVD5f*Q#5(wIZr{t05O5anH zeN`jA$cn!*3agRN?Bw8bSn-T}BQ~g1E&pwR_S`o+f~#@BRGzWJ^K#>Ser|PCZflf| z3J)WHU*6ydDq3SmzHOb{F?}t+&bi+hQ^^=c1FbEAC;^TV1=&{NR8Lr_Yd!eHjD?@6 z{%kH`hD6MV#kY_tk|0-E#+Ce8qJDB2M(71KBiqrPMNowqd@b6FOXW(vX8`(C>SR57 zw{Jfq2}eIBq~KgS+WFn@>epm4B-kj+KO27-VW}$(&1u_JPG3*{0Wis8BgR4Zp{GGP zGPoL{I3et;{Up^C?X;*Q70|G$9TQ0$Q4~CnFFEQCq)!@;LzdyoT+_7so}(FGmWKmH zap|at$w`8iDF0;58=^p$!rUEt!CTKh4B2FEDc6DX<#j43M+LdW?zQT8S;`}9QkMm_ zv@#zpEo)HoxPdh?+>u`FKd)+9&40QLE47uR(k13795-En%&uc_3KQ6iX(sr8)65M( zWBU^ir~$Otb=QQiq_x^Y%F>8aGoOj&NQ?_VDTPyNw8WHFfB2)yjrd70`vMN_q4|;D zX0zu6P*{pdHy|(8loJ0LFzQIv);Y9Hce3?qtZSex@~yVLou^~Jen6MS^KW|=dVL%# z2gNU2H1>IIm?%ctU$%7ZS5qw4UE5_alghUpu}9M*{8K;_e0acGH>RG^sAt#Y<43O7 ztXeoMZ{wISgIl9zWN?;hb#-M)-xCjv)G84+X4z5hWS&}E1z(;AaHn-51k?eU4-Zx0 zsRJFj_0OpVD>-Z%=bybG)*Byj&DzlaJW6CI{2EteIi@zxMZ0m3)I@k?*rB#*u(*DD?Aux*u^xlnLY zh-?GB_fhT45y3XNqV0dPKk@t%NNc1W{;k^zH_5%On3MSn8GF)RnWXFQ$pr2sBE*dc 
zmbSRR=x~ch+}vIJ1J-1KbIw|9c@!1gRgZposL{tN7gXV0nrudK((|sc*I}gB$${Wu zME$OE#Wn$VM}FcQrG~@(HbY+CjpG^71r=!xebs6We=#^PkR>*Tx6A?ChW)Le1y!88 z*p&(^Qyj~P!cQC2i>=x~y1UfcH1^#`S(E2`UUs&eQPJ2hYsOO^Ur$1F86H?(rA5S_ z-nAKteD6$6z00?rZrrjzasG_$QQ{;wY3$)!J_C?{&YPhDv-4kdg%CrY??rw6x!${* zHvszsDtAp*^sGoy+>JprQjx;B=@nh^@{34G!Ql1b0?5KkY z4t21!@L68qxSf$aadS7s?IiC|#DaE|-M{*b)^!pUKH%vp`3Invn^s4bVgn-{zHK>57RN~+CQcNICe z4P?5EfAQzLaRCg1H%T4$CtRnyASl1*3+a}D_j%l690R_6VWqJ0K z*Vcxm!@L&g8ahA63niB~bhk(2>$ovxGN#}oC-4Ng4i%(7V;f;W>} z^n=7t4KYQ^ z%i4n|R(c5IVvP9!FBRr>r+k+~I*Zx-n1sETuc*DpJ~5XTC;bxmp*pac7Pd~YKwL5v z7RJwPs=_Zpi=WdC_zX1wm=MVGv{gjS(P?`W8K+PN-`vJLGi~i-V~Nae*1Q+qXkwKl zBBd3vuY5G7<5)0620T(4<)x?^D{TF}`l@5cCn7t$*0i{jUSN2RN z-L;QotwIJIDlbYKzeYM_7K+F1axE?~mc;TRmZhnraSy4!3@gOmgjZ`|LMW?^m7Xdt>$7c-z9~AC zk5!;QKcJc+w=8E*Oh66tXC0L(BIV4sQrX`ZEN{8)z2J}bz&IJ~-Ux31fo&*U3DJ9z z&2-~e4NV_MxdaTSLF4eJ-k28o7{&2V4tfdnD6WP=s74Fyho4%P@+<3?j`aS?|GM)r zA7S-(;sx~JJG};db1lEG`m|$CH7fu)c^n#pr-WFAo^C)I-E$t`>f!yH{ZVG zx#i8X5z&97Z!M=SQ#C8BMQk6$*vsf3w{fU*#K(Fh=&@wCA#d>Z3h zX9^Cih;~170{aTV_Cc9~!r*_tZoS|Oy?hyrq(H-5APOF+ zV^JejN4-;c>D&K09ds`c5w=N-V^G^gl{82~XF0YO8I5rUQn_L#aS5N5TQaCzq5$|0 zJstCSTfzbkT$ud0($kD6)S!Uz8LfH!fsdX%_OF#-miuaR{kgjXKiE6f!)>EN#L`ib zAYp+)T`Ka5-twqRJqbbjd$KA&`xg}rw0YCP9WG7A?N&k!nJtE72}hX4D-!pGbEYZK zWUA+?#Ik8ogWOqrgNJ>)@4h%Ey?@UPDnWE_zOaA;=tva!l8N?G-j?dzt+dpg@*l$5 zhC;;7->VvE?`;SiojtwyPVy38W??Z-Gta}n$pfwl+=k6aLB2iqP(tud(I>07g@{O!O zTQ$k#WZ3E7;89C@&XDJ%l>+R=o{r3kmW97hdOmfOMYRa*ovFWOC*u<3wgSZ#DhHQd zF;3UCy83eyr_BY^Kl9MrL2(1iSuq%!zU#QnE|8UePe+`Q=!fXd3lc5baBZEMJ&0+2 z)k+h*izN!92&qKuS#FN*Rn-uHF5 zqSq}~`t*H~H`z)2H+S>TtKaB;4^Q_LZXHaiA{j|A-ZSqCiGQ9rHtkz_F~ch)ABOs> zQQYgk?(!6T6G3{hRTgnfLb;Xs*YXA(g?F~G7TA@*YL962Nz3kJ$;qzK&@*H@APAJ&)N$TR)iYM$ihFPcEDPc4L_FWb6U4#v;V~BBG29ccFFX~CF(oWt0Q=X{w8nJ(@9Wi zp=-2(7a8HTJ9`hV<$~!t7r;v;&!n zRzTw(K@Yt7a98f-Rb2+#I@IFqetu zY!tIYcMK)6B7=yL5Wk+n(mPUgshxRO7Jphie?};f*TQ6Le(OXUtq{rhCa*o6pB-}M z^R$RpQ}hf3xwwk8AG1yAifGvqF=J?p~@1%zX* 
z7ZCU)2=n7y@umw3o_c^DU@Du&o0jp&LYzTP$?3UzmBHP9@-}n|pW?Rcic7Ti#;}F&qoAN$8=4C zW6z_Cr$p}VKxkMbXG~}K`v(G)jGOgsF|o58un$eVSZ5&(I277)6@U2|VEP5M-I;GL z*xvKfsn4_62@AQfVIMr^d?}4L9$; z|Gsns3qiv#2I>+_Hj$ur`rI4XN)YF;4Uks^!M7Q{-KYX;>Njq$h%~I|b&(rrUPuXXADuO;SaEeIs}n~txNtTErAKu7V3L@1gj0`7KElH${EOR6UZT4vE=+xnwAg{5(5cX2X!y| z=jp|o>vdPF{+>7O6hLDnjcAbMV?Fl@cD_uyb6Z9n08Q)=!{jpmIBIjf7bETGz+(%= za+doDKpF`{^B~GHm$Y6FJ>M_!0va{@++g*-wN_bXw#zFO*FMWP#1aPgYeq(~o-BgM zuX#Oh&C!)=pAZ4IRbx$B?0BkuiEHR@$QLD`ex)KTF8gn7A;~4d@(;?I{a$3&t4f$x zPg#{pP7;X3y45;@@15&J!{Lhu6(4X4&?JQbYy^Fv$tZ&44!lQ%j{HU#9vWl94FwRA z0rbXoG(PidR=BRR1E`>9+4#?P`SdiYW{0CmH$~2Q_cnI={t|nXV6&ZzY~0{4V#)!$ zV(*HZ{mq?kZK?`b$~J_~32v1h*Yxj2fK8Qj@onl>=JhuMXh%#HGM}j2X|;C08$CS0 zyM3Hs>XG2qT?O*LN>ZBh5KOeDMX!s(_(H|GYxiW@@5=!t58UX;>RlDszo?yMOkX3s z2xLBp<_X@$qdM^=ML1<_m;dEQjN?ezNV6mAZ?AZ*?|W}fdIWUBg~#E!o5q0MxO0@l zGA#6li2Vj0Cbg<4OX93$uY%@cD9^M1D4Ig`UD}#Pg|ce@kE62=YwCUY@Pd%;?hpi# zZjhk}0@6sRlpx_3X(S}JAsy0<43H2}Qfb);=|%x1Hl#y9KwvPo_x#>#|83XK_MAP> z&biL{+@Cw{IO6SreGpuJztWX}3$87iJ|nzcc~Er^p6e-W@$^Y>ch8(V1P5e0wk;`@ zyt3-oXA|~0aD0c9V|Dl8(#E4~j~SBQ)`lj{qU_MO!>gO#IsV~qJufn_7I_*C{*y_n z{K~G;`T>1>I})q~3ag-XF?n^@^c98U)CLhwwhXWqb-lS!3+9Q56C~*x;&>Qfx0#Ry zoqodx*WO2^wA&vY-2614;S`ofiZD=pyqWZ}xM=D^mHgLtLiUt^{_^sTokQjg7GT1X z1Mifl6^vlckc*=6XqSmEMng$SM+bjlB3F-V8!BwU{K+tsf)r(1!;Ne12u&S z)N_OuHX0=cZ^nC7enn4S5>!8;E6!Sqd^nhh4Av2EPl;;Gx7naCUiQSsL2)+^nL-%C zL0zDM>*Uc`vKMOa9$bt|M!Z9zGUD3K-)w}1yBuzOoWT1gh-UsP*>RY<_#+=egq}C8 z5pP^^sq`8R$d%6NF2mp4gPng3YF;ks#XB4T#Fw7IWiqiiu=d%r6)uI(jtCg?j1?Dn zj(^_ORwMjR!S1zCsjIZR2|J^qYAcLonkLp4s&MDeDjFY zcQBsHBaVYa4LlceSr5J=FuK=Z5WsP+kH>$^l>q+6xKil(5G;+j+nF%&-rdU#SwD(S z@r$P3`IV(xM>DxE?hk)nBA+^7i5LLnCj@0PnsDiAw7^zGIK>;iow_fND_Dy?)t3ud zyMim$EDeaa++El@TxQ>D$%o@LE#Pc$seoyh)js6Im_m?*~FYFDpCN;%r> z#PY0TmtlZl`#Rj_DM5P|$a*_$xBO{!#&tcoil7W{Cx;2GS1Mmxb(!ONiF?WAj+e&M zcSBtWar{MLaep!noKXmg@iggPs6(5dF5mKRtAZhYKp4m{`eR&yg;OD{l^h_M@0X!g z^Ju31nL7tPTiQp7U6fs*s@|;?tz3+app$)MY=!EsB>@r+KGl=2r`&@I1-xVkK^yd3 
zb9%krQB$}DBiB-}Z|S>~volyrRy+%!P_E5ArX6Zii3;KGxb@B2-Hh10R&(4dg)eUFO;3_aLIz!rcWB%g*b z%$Hm)@EM?I9CwH9K$0gVrdz%Hkhzu-c)t>*b`tKT5!)i~_U){#;7^IidzG!i$vzF? zT2UmENaF8fhHDNHzb91K6YG33P$IZ@pi^2iT?8qUYVmgSP71E8t6x-hfQMLM(Ft3- zSj+jRwA`$COXsxX@ideR#(cJ(T%L%OEM#2?EN-BmRtEJt=lvN%2eSL@A}K zx*cc{P#YJInqNQKU0;bQptdbTZK>sJ=~6nHh7bQktp-xAwOCC>yVu*~G1*Q5?2g72>{-yjUOlZGX2{ zV_RVL3X`OW*|^uSKX3vJ&I#EvW85IIYw_6K)F&4ePjDFXq>LW^+`3D zD1^@->%P#FIgQ{Wcq~qCofdO*!?~8iep&k7P~0`jsPHt3nxQnW^Iew9KTMA449le( z9G_ol?u@Xn2%;0n*1jRIR1Ow%@94iIZDI}K_F``LM|JJwQ0j@1c!O*{HRD&Z%Dcz` zso3pY?*sk)8owX&_li-=25><_F!k1pi2k`wj>M`>9+hp$jqoch1USIHK`60k+zj74{eV>#3|_*l+KP-Oi(m9=kzT zKkdE9^O+KAy^IH)GNeCOE-R$lYn0j!fdNK`kk`r+D%Y-phtMxYV=c%I*e}9*d%Lak zsD|M^xtqT`wo-o%EkY!(O*$Knnb<74-T55cAxU0}Zd(3@v1oXOZRnDI8))$N6nh-5 zk>kVFd|8`Xk)p^{ciFY8Q|(qo6j}`j4*eHMrIRZ7YvPB|UPJG2qWyu-W>q`3gH*~} z)1u>t9fRGT#z&mQ$qTAyQjbN<@zC_5#o^hdOzPNAuvz`%;5C(J3W1m_lGcUG6n4#L z3x31$Uk19AM6x6f=nzQW_-g#cal{Gb-e6Q#d6&!AX0y}%Hhk}az^gbe?{IDZFx$u( znqL&x&p6Dk9+unVI=EOL)V!RigDocksu48Rp&rZ@eWgWS?H|$4-+K^7Bz@P`@fI{X zkZ}3^0FC{ieHzV`@McG&-x(?)ZRn!;UK6U^RKihkiS9ekO-sXbo?T! zJM%Phy_WU9Lw(ovxY!Hk-)9H3=@8}QC-H9^?IHsbwH`z#ws$?hI>?hOtEx((j^(&F>pUtdGx@x)sHi^{|5>dtF zH(s@TRJqvQdsN$q3+?va4Xeoez}iinwD{+@NnGLjeM`}o%62vtFM=Zk@YUw$J0B;5 zMbQCR);lAna`r>_es$n|;wWzE28?ezw^Yna=}xB6C-s>&EZ&02Z3~DWOTlI(&fzwR zZpl)kuG~w(jtD3;dACgyaY!Hn*&Z@uDu? 
zv88hB_;;O>Tj(c~#mZ}eaqtD8^q$<>V-NTH`psu?XA=G%#QjoyIm(wVr14mx&lsLVC~o5x~G5bmQT=FZwatWN{{&OBjfnRuYh5& z-KRKLAiIQqIRg>)>%tem&%-DKdDMWkoT-?SA) ziGUmp(~tJFjD;&<#(tnT`!nTcRcqsnAj`)-bLGB~;w1a@VE#OC|GZ*IkMP0Yh{QD- zLZaWsmTARc`2EcIl}%N=yRlB#nZ%JPP2`vP+b;!mPdm0z>+D92L;X7^>tZpsq$vnL z5C*m?%dH7$XOcr9O^Y(*)EWJ5(K7(0(?a#2WJ`)vwbV+;UP_<#SvCt0*Cn&f{3pRs z4X5}IGj1Fc>HgIk@e;>$Keb$QRg>yF!V3pDH$7<8Qq|!wBtA88YLu1NkmY1DrQ(wqSn9%$|Is3D%GbMgzr#qjE8(8167p z9-th#`EL!@*r zi__~})7UvcKU+^h0MHEfpC0Rt@f?930YTAKy(b!YJmq1A@s8S3m4Y{A2*IE1Dib3z zOcK#6X&XKNpzzL5K7%xl(4vflIQ%V2d4e7^(3TeC^s}xLK04+O!LYwgrdE;p-?^gD zHi;=j4{GOLcr5o<(b?#B55rr~EJ6hUZB|MqIysplwBe#$p=KjPBugA!zKRecpr5Z{ z_3Yg?oR$FU1X)XvoC;_y@in^P1;pDv@N(SP1c(J;VCffT1jyKJPI zg!o_Yu#YSPZcGUDyE^5BZuDh~>K&uM{WC?dao49YXgbaLFr2#)QvYl#tPy#ZikR^WllwS{dLs~7 zYLUs|Xy3X%ZTxl@n!*8UisWir3o1&fF&2BNi4Q&jzBHr`!NJQ@*mO;6;wqRc-?aBc z1<8g}<3n(n@CBYsJXmPAV(T;|w*>t5!h7&K{^N4f6q@{66*UnXF%qurAZH;sk z*pCX{{7TS13pEl`R{QgH!`_brW1T=$i$%#G#eWl_gA$$6mD`71y~6Y_7$e56igBN# z@(vZLK8NfOSXDV^vEpl3;VX8}@`5r2#-sT`y&7j_jgv0PjYjnAjxBF5!Qa$PN{B@o?xz^94Jy zEOQz!;-K1+88<>gjA6FVa&YZK5;dU8WEY+Ik-+2r53?@dHiWR1t|LE0C>%70>O2%e zBq_p8k=G63(sbCD+-V|Cl=GHQS*njv8BR6h;YsH>Bnz+e>{F#Bd!Vhch86xfBW5_}6duI=J;G-%G zgK*ECcIP|oUZ>sJ-VVMbH?EM=HR=85PALr7&3}($2zAr1q^2$t-p4SZ_rdO3dy{d0 z&q>juMoHVyZ6E)mez_z$a*{Y_msZC7*SiM_8-(RNoz=5g{K4##qfUvpohBGJHS+!+ zt8bLzzw8VsIK6Grv*iiyu^(Iu%F_vU^<1-tQ4q-qM%YrxuHaGh!uk6~2qoY^V>y(8 zXj@wUYAj0oUdi(K1|ZX5KF1$;Z0>a@s=Z8_ul3{fXgs&;f9Vrdg)HNGq!7n^}pL%LREFe?qAV z9FCDu`z>!R44%onc+r<2p^cdyn!r407dU+e)uyTPieS<;0Zky_kUH0|UZr0<`F2%T zhptASPgvrx7&KDcf73`RIAjy(R`+DQUx|1_RKdVRL${dcCEjXl%z{tJ-%66I00kTx zIfsGdyW?BZQmI1s4+xVL8Xes<$E{}G~yNJqi%6#1aOd_v=iwgVcEDq48_lMtfx&(kA0DcK2 z4DN-Eao8%XTD2oj3El1#fZB>DMC3@okkLFli9#A-LcDu7}`12(d_zPhxVmg72gO%y@p1O}uP?!Ftz z-NRGF;%Tz40U~KSI~@!VcVHq_Cs~fGp{?Kzc5|Xvyn_&o1^6fdT6F&!?*oj!P)5gn zX|=C6=rqlk4~{jaG|pAGNE=?Q&i* zOG#T+sg~fTcjf`HT{GnoZiroB!&;pF6XE+$DA?+s@`gkE z#!2Yu4Pyxs$2?ws)p340SHw(C77PGj6C-p>G1gM@)AhZU&7^TElxRZ?K@my5-D$DQ 
zXO4$drWFyT=|(0bZ>GEiUnP>6vpGg`JhJ3t70eO!`7|uxt0BC9BKbs7ESWTtg?eNV z(&v8#jfc^cf5yR6I*!sB81(q+gAQPnAHklZay|Nax%-YRT1`Ba5Ee+zYCrsPs=NsH z+6dNyznPSk1%dCZ?Wm*6494izekVxE(vN}}5&~166714O9t-@ioZ`d5d0zv8n z8FwPK+Rx0AOEPoR-dH3(L|RLHwMy?8MJuyw$E5VkRD$HNFRa7kFmQii4l`q4?!`PV zMCjc(*JA&Mw}N>c_eO;aqB6?-x-HH!1eo@X`*HvmxOq#%VeChnz>-uX%0*GWAv_8ms@IzHby$MLLFSSxwx09si+EV-06(~&^?d@G4( zb?c0`RTsF=VsEs<8dF3VZ@lyAKZ+NzTT6@sYT4XTKSAU*ZU%61k1GYRrVc4F9>)w- zVt`kcThuVf3}G#(;Jl>VZS7fd%$)`(-Bup&cg{%xe41_ImEhh(fk*_Z0dP_b6_J%) zl`a-%O)r`pm+LI){&msBEA2ZrT^lN!7cUpsE)J$B4l8oo881f7WVt&VTxgT$3I6Kc z5~y0`$1{9R-A6v^kiKEr^jo7xB?5=lp@_=-s8EDhkndem82Awc$skXRadgpU6!dBQ zRwr3qoTty6q!lX~iF}^XEP*LvjnoNsR{}A;)z{eY-<~xc8R8pTT<=l)(XFPp^ECR! zT{8KSHkHZYYn3HtMuTz<{Hg_>hPAs1jwUgeJ!qY#mNyVjSR>QpcQX8UG-B^K1$IX>)zqU{Fya-Gj0-?2r zAewFY7-ZCarb%D?pllKmyxoT+jLQd*!B*T*$rORsZ+`N<-i#C%4QhH{Bh8ZJFYjEW zwN}oOBG)FRaFQ;=t~>03V1sq^lhm+aWXTOtnun|FX*0v{IBLt!64l{J6%j_#kJsin z8T%SaYz!`ehIzN)|Aari#7Um`+@vFG5YlHn>~P+IRGyGmK2r_A07YOaw}ZUb{M(-L ziaa+M@%6)01z4NM1zjwRH~$cuYB zHA!wMxylFL&<3lIS9>^*U+V*VEP$RRQh&VbQ;Gc0CTj-mDoa$x6~beJ7D_BD(do)MNP7sA>XVIJR{$$ZIt25SDBF zDgK&`OJKpYC`)3|`m<=f(=46r9XAe&z>F0it*kTpHEHop>51wuwHd^AyTX}wZRYNS zRJsq|ys{AqG>ViQ)g*ryo$UDZ(O%->8wYX?sjB(24?_xc0#HAn8hH;9kbfLZVb5ZW zEXHNKOp&srr>j}6gxhA`>BYXO^v9(0z<`*dHn|us&WcLWlTU}_*r0l8C)(|_{ z`e=tV3%xkY28fv3Ds@Iqa}mE|9K1r+VWWI(-XwDBg|3eX)$J3eds*v%B9y-)Rq{Jk*xnmdz|5tzSf@KjRs z?-TbTZ@oIx1R4Cpob)8;Q}x^(>(WE1^Xg%}1Dj)n(yX3iJb&Nl zBZ6Qex5~#jN4X~Hxl(K%FwkCCZ3z6W9r^H_4JL2ig;3Vqs4uTvqPNdp`M!@7GWa4m zD5&(ioIkhWz4het#%(orAv%fXQnCh#^cvUR&pP?x<>!URo%ipanBv*<>jlZC3l6c- zovhSu!sHvVb4b|#H-F+n>7v3~a?tF8R5V&b}aBBx%K#Y!t<0i@>B zTrXR1Pvt-lppGzdH>l4l&@S_Jrs0{^kLp*WK89?5HVkWYrI!HtcydNSP3dLv#)TA&^7%iP3^19DrIe^IH{rgd=)@BPD@%0I{7Bb3J6RN zY~MvL94GG?Rfzs0iT{DU5;e%FeYzK|Z~i!G_VJt6-=QQ5jz#|ox?D1SM%)<<730BI zg_+0JxwDcAH+8Q`Yo>nt8!t80dvA4lzaWt|AfU-*ddyRsGcK~%RKKlh@JFQce-Aga z;tPgsc2-|kA_RrAEs@_L9J5P9Had_TE!S_{&;8}#F9oTGwEsLBj;6cg%CI|KbRZES 
zMh9ces@J8H{P!31!paPqL;ZWbYb#%ZKk}vMFP~z)-%wS;mwP+ln7gK;H+(*Ga;^(6 z%nEU7`n9xwGd_e8scp zvDTr*7ZQKXvOi~eAyS{XVIbYF?}fY8eN+Bt+_KWe&DXnFHY3XT+QT8Sa#D~p<4_M* zdOMM9I5}t3@jvOnANL%EU*B~ty_jmN`+m4G_4Q>*Ow_~AOMz3WK9>c*c39z~lp_(Q z!xq|Yj$Cg9vu;qYLjr#p*9pavtA7)!^LxHnzm}j*k;B~_ILbIyNUN#f7wiI#cmU4l z;Vp!6Q*Hh*qig-~a*B&Qb;ab-Jxh+Y<`AK>dlk;FX`ae>$TCQ?x)sP>dv^A7q z*Ml`1WI;a|H9zX7{^yTmzs~bQ%TTXw&Zw%PUX0%nX%#zqA^9X~zpQ7~_bHUWgG!e@ zikkV(i6Z%tU7w-VsGU$4vW$dDes&`z=kyz+#B_4v!eI3E=_vU15*{w(dxYK&=vk+n zGj3IxphMf-ZnkWGsFwTvt30|Ka@hSIM4A0_kVN+qYJvTF*i{!D9ovl*^xF%2;E-{j zhAy9ACK6x3o5n-k+Fn#m2wGFX=9GLws;!NQiC&y#=`o}34;&wi6N)QtQK!)I_;|=?^dpb=d7g_Qx?jz;|r@#+sAjov7_I2?8e+1 zsy%-s+zKzq3Wd2tKfj^61Dm{Me&6fdGylD-1dhga%*|HlU$P>{*yq-sa(B$!V?&E~ zyFyn#QoV&Fe|&!^wpWF_SPFS7cD0ppkBp#ZIG*oSEJkYx$-@h#Cwyn^sdD>nWFg%Z zi@$$+xnz5siUawHMX2e?5BVssXG}jUcx8RN51QVMAP4JqZkOK-?c2z0w723(yEJ8^ z*OzAPp)A3`k+)Glf8Tkg7*2QIYu9z1876W6+sTcw^tv8FlT&hXd6(F@ZOAC}v5&6B z6aSpyUJ4>!efV(lgiIc{MDe`H_pxpdLMRLSCHF&uLd^E- z-)@xB@_1pI{|tE+axa;f8MP@OGzHR%(^dfGlrCB%StgyTw_>HQXsIv>KfCd}U)v@6 zPdRx^KJPTIf77Ol*ZN&^2S46+-l42KMeW){iR?xSXHj9pmlIp2x|a~_tcMJ!_1$e; z@@hP@ukwSM){NE??020CO3|mFXV7s&p7_%m7Itd$4?+p0Ekgp~tL%mX;l`2@_RDm> zPfr{VQ_1tv&oy>(YI?3~=a_M7l$VJKxd`Sgs{YQ=%pyhqd2U^Xn24D!(;oi^r~nd2 zoD(;7{zeOuB;F0ad`k!%%StM)6n=vwH0aW;!`QB@bl9wUTkWh%DVj3ZiUu2zkN^gjRrGBi^ncUN162>BF9Odq<|{3K&9en4yR0D1 zQN#K*wGxNqp&mbWrt$d?gZ@QPF9y+UaoDW0vWxts)0TdMV$_yFbR!_&&t{}c9a);o z2N4|aJMO0isDwq^jtJ+^y%%1;wl~6e*o$|*<*aRMej5rM3H*XkD1p*4d+Ul--@i#- zP?eJ)d$vZuzv3N~xR5dI_5S)aAppZ?1ufM8v|_w<t_J z$>P5ogRWOGy)IDaOSX3kj8(cWDeRl(^mwfQ>z94cn~$1eTXDIkqi*HlUDuza*&JmR z@EMag#-5-8RNoetDH6ulUyVInHib_9Mtr+>vE*5%lQ#P%y6LYeEkeEXL-p+sg-hS( zWk<)3R6&pTAB-W$GQKqsQ5GI=ecAhkeD^`0b-cEkzRaB$U;EH%|0T}!MUHvyw1<7{ zr`D+~=m+uH zLq^}DIYjvuq%vxKK%>F2^-J|arKjrT`bLrC$l?>^6T`(4tmJzNZ5`<1hEq{(k#3`% z&k*gJ{A9fgFd*#>PL4-s90KQ$msXAqB&k1|lnXm-jGz|K&LoWTGge+x@V%M{TG4K`(Vtfp#DlYh+x+DuFX7P-Fee>b^W-`d(Vhy5>L=#y$?CTzY zxbD~RljDqEf z6X!0uOP+rM3#t#9D5L0^UkPwz-{Y1n);0-7d(8v}{9ch8Fd2v>N}tkcqnhS{y=R@f 
ztL7?Wy_fl^`Wj25yKJLuZU?=Ff7DV>Ud#`U8l*P84@M}oV7<)ayY|$`^o(Vs^`_yNU1qPV(gT(EJE+7i zx^38ccyqHTFl2gW4+l%u~?VfdtN9^{2=IR6(lC$cX=q62l5wfm7%usF!HSDdtjuHvMx zhHJC3MX<$U4*|lf%V_Y=1-?ZeQUK0Wxx{eSKb^U!QAGbU-5X?FWt&|;Vg6wpBzj5! zd-)H*(T@ab96pD9b$W_naUKS27zy_jUj0&%E1|Rfb`9>Wyf_9DU6#hL9BC)NnT@M=`uitEhE&^3gb&cmv@AkQ zj0|33_zz#JSL8oiNY<)==c>%`bA#M@{klv#avs!qq-gr`8_~>DbU_#T5%l$xA7`lc zGmi8d&7c&faTs<^``dwDQ35&nh&cfAs>QIF_BP2UhS2YrNi7DQrvmvZZlNFS>wR9X2g@hqgDtPkcO$PHbh1B}bw2le)0cJu5M^|6Vp)RV4QkS^@-*q#W*gvD;U1PJH*)#O+HG3zclyXyI0UY=V4=qXVA*skwEJ;-Qfr8@s=)#l*@eB@Pbtd2q7H zs}Il@^&28R@mQv!C8rZG#?(5?u>WyfPHnbj}j zCQdqaK5@3#&joYm&TxeUq#N|{FoZF z{ZNcc%Pk1Gt+sR)fujM3*!z6v^40Dk1Isd>{E_qNqyu=ChbKS&Ett`Hsw-;jM3Ibv z86tzLc~y_IezOp-4px2wAXVhy;AMya(u+I|$-61r!A6ZZ31OOg(uJ-#(V*pAHDetM zbNhWsU?d9U&^noLac_3JjTHeRq1Ia6-YfztE1#J?#MLNM?a!so7sqX%!Q?bsNdt-F zWgtmR(+LgAg45sJ?V_Wpc)Auu^z0P5#IP zz;IY55RGJ=3@LaXOIMgOcY)wm>#)+J*|n0E=5h5CBDfR04Dzhd@MCe)=yxrK-#FSm zfhTR9G0oppL!>cnDHjRlD)Z&4`5ysdiRpjh@=m4wDBB1AsFzVnfTu?Mup+STE6vXW zc_*<-o>}+8oRtX_07BI#!ssqQN|NZeHJ%NDhioAVcdZwNMm-0W&9gmDW&csVVa#bAx#k5m8BC*p{KyHk?eTKuSEFp2ABD zAOR2nV^ZxZ=+2Nw*LLmvltv7`xQ_-egBL1jUUr*RNN>5Y-MFb)3--C@R8(3e3oPTjeO`r+~1sQUbeW3r5x8FQ!Q@Hgndn4=< z8De&e{gP7L|IWe7etH4qmE_fFxYAK@`n#mtFU~rwE`(%%H&3<`CblD0L8QmO(Ld-u zJI}24@@<^+zJlKv((#UkeEqhugCh%7Ad+w2Y@`}LjS@FZxqtL_FIT94T{N>yvk92Z*V^6 zk~y|8DVfBwWoIoxu|T||g+Xku;Cy8J@#>dN{znh)|mK;t4DFA?^= z8Hk}dVlLCqGZP6h5G}sQ5#>WJfJ32Zt}DUH*Cp_p`>yobsDm6+B0Uee9#cQKSXeO4 z7XngS>-*Ui9P*!fUR|H~DZOF%5&J&E^T*9UcUo=gI&}8=rh+5ygBB&enL539p<;MV zj1U7>K?G>FW{c9`>*LM)xh(^HvM;aRY{?Z6k>{R#%^bx|`?y&W(>cYSRan2Y=(DW{ z6GLjp#i-WvvGkL(=MiQ4(+-s{@*8 z$!F%76%oNVU;;N_;x?0Y<|Ar6Im$?%@93!`EZfSTzt50uZ?MSJ6Nkv;EEMuPpQ3v> zru=KDX zk5$t?!=40K)!AxKl?p8^?wo6vV~-`mzQOfPgGZ94@A^BYEy^5bG=yoA!aG1Oz(l0$ z+ucnyX?!3;Nd?EGic5jM++a`!0-+kcsoH0l(6J89%e}|OVh9AMzf?^Bxa)}k#1T|) z?!_LzHlN>A;)(hB{*5EZ1B}JzvN?D;xa2Rh?aY4Iz@DRN?oH2XOlZ`#W3*m|xRjr$ zUZf)Hrfu6tHInQ$F%%4e@C*H#v0F}2`S;FJJediU6+YY0C(q7>d^QsYHIqta 
zf$`>wQg6x2fHSrG`LEloKfXzhzK+M=aVQ1_MDm8{b8;KQsrM2Hdw-{=qr1^8U1Og3 zga6nxI^TVw4}5fmhlID(l+>UCa2jcHPvm-OiY?d+04G^sZfdI5>~YLSm8F zT1qGaU1B6pBX-N(N&z`0JXnAMH|CUxXwZVgQ?74YkoNzL?A#-<>0 zBJT+CVDN3i?Tv#j5UjQY4gIL#3j@yCA4^uQEsXrXyS?N_rYj1aw<(s;92 zx#jA*UQM9WW%BA9eYpR(<5IhcHcTx2TxFBYn9ctWxwaoX zxJ_CuJVbY&$sCzI7W~wr@vAe0oMdxt8 znsqV5dKa?-h<(cKTW>{bcvJRK!3jbtKPL-i!+iAgucPNS)d(YB-Sr)R=DQAgisRsK zxBJUBb@nOWIeKFkos@ha*RB_7@H7yIZSHlVT%rg~&zwnO0I2XjuMG6rApA0xP}lU< zE&1e$x~K1-;|6&5974C%`Lz2B0hOWg%YPbf9r)96fFuT72sn{$5&U|3)c}V*kSEIryX2ZX-#IxlU}!PH*3pbY3c<- zX6l5ug6PFdz6>)jl)KKMRc+JRdOvCM9eS9w2oH{d=bOT6uXme2%6#)aS~$K2Z&L=# zvG7nYAU-&SP3|(6cUrEHt{=htNt7xp=pOyO7C(zfMnMu4k%ts{7cM07&p%zj?k3Rf zR&(v)8i|>>@Y`khpu~64M;k5}LDtp+x4MYHn_s1@OReFmwiqO#tkXSz(Og%r!o{zX zY6k>X;d>~@2b$NIuBc>%{-MJAavgd;rln*e6))}antIUi@iTW3Y{1WB1a@Q^er-&IBa%%k>dbYwJx4Je2Fc^nn+Z_uEWypoKAzcst9D~KYWyT9XHFswCUn% zVV}g!tr_Mb#5l3ZOcA%Vd=!M+7keN|6myvwA{tG9h>`}@xI1f;&%S**vg9BxfU)#V z54XQe=w!KPTm(ozD)yX(i5;=byh52r*KNX6E@qq(8z%@G_`%btbg7~=sda6O=(&-J zydut;1mdH_vsZ+owNHCWeSl+vf1pTwnvd8RDJow;mV>lnH-JcDYFX8e)71=f-v+WR zlN+M`h3y{+;eLF%+Th*BcZpMyg(#SeHAY2Ofy7!#6Gl%^b>SX|AWwNU;c5VFs}Yr%G9{hVpE-y z{pbDL0%qq$gjQa6la+d zWtVk$6jfs~+GwINA_m>fy?tX?(Pnre<1?qL5u<>A@vIU7;OA&;!Ij9gh*NLv1;42^ zS;I&%#FK|ze#PA0BOz(0UyhnES3K@HL&|Vdiu5dO-E!s(N{#(z#~NI21=sj5l?9$d z5F@`S-}2RN=ElV)f{hxTR8j<1YtM>gs<@Wtgc=rJ8bs2zb{fK=PRY2A!D`B1$7y6rvrzG)RVHa}dLpI&QOl#ncgCPgvi?8a*_Lt7n`1@r4LYwG&R1Sl2Zj^Tso-vePOD?zjj zk5XTCfj>e_0XjwU<})e{ayaNE?*5e9BYfgDTgX2247srP&jZk(QRWHpH;l;$+##|0 zaeXfc|stXpqz?<`d;M)Kc`O~mLD|_&Pj|R*A~I-KPv+I zy?0-}dB$xg1PexV*pWCrrD`n?DH*{}wBc*VtC$EJm2WEb9&H+n)&R}(OrCEfT^zkH zgY7yro!0gV6ymf>jn60Bhol0Oix*^z{7LxYbCzP3_u=0->!}whyxFL2N8^40h0e2V zCvh0&L&VW4_mah985IV1uaJHM^F11``;P{s#^XQJ!kHChLEXC48jhHI@l0wvNcchNu_W5q=nO$fCn z$h%>n=>gFP!h*SQJ$JddM4UW98KAuOEsLtMQ^Fnoo$*JQ?J6#>`Ah%6%Hino!Aa;( zl-I>W2G=ABfrJhQDTs03`CFk;Y1y$X-1y*)9s?2$j@&oZB)m@#JlD3vJ)vCFP_6?o zE_E1|(;MsZIC!@1?p~u?hZ>F_uv*d_A_i)qI7sWaX7~G~JSitnn1Bwf&X$G 
z)uH_wKguK?;R#4M$j-?_z#nBHzMvrLQ3J!^d(t;RIS6rJ^i;lu0h!#|47R13ZF$b{ zHDFI&Vs7HNd<%s-ca^s4j~A5o-vCfFr8js22)tVQYdj-vJ#6onFKjID+gu|(z^SqM z-ray6Fj^=$g*|IpAoHaJj3(fH`FP08`6ca*%;_T93KIFD#vy|L+3TK`yK(9_jep8r zfTbV|0BYmnls%eSZtRoME?Yp-12!k&8%I$xqP(D+IO%2;$$K>s@37I`;R2x* zdelU#33l)!x%GjHj@Aq@rB-S)Hp0@=I3{@d}f`o_K%Q$0iz$hl8clbBu-i^RY-TL-MZ3?X-TKKBWJ=lfzSDC}Bs z2ev;0^8P=L&N`~e|8L_PqZ{e&MpC*v1q77tl5S}>I;FcyLK>8o?Tbh^NC?u6(vo9) z_Iu9G{@c#E&-UKuzIT7F*YyGk)FB)CrI~EedvJAL$m^;NfEl{+WU#E>#JVQl2n0^+d-yf)piRbRF`{eY6~GT)8M9O2odKJ<6~rl>3v zLc8GI67?>+%rn2_Gpr0BQx7ep!MGRS5?@u2zI(;|4?Dh=;n@$SjG_;4gOnOE@RXeK z51)jWHlHL;ACgY3Gwaxu0$U=jA)(n3MG~Ug+ovK~vRrfWIO{;G+0gtvv0Y$VVT{a6U#piji#SXI zk(99T(g3`A8BC@R@X#1IeEh;k5&bsXk18O_coiaXgGxCQ%G5krb0Vd)>tY@aZ|yCr#m|PGp?v8=p7hZ0`~LxLOsK52N4P(=)PKg~emU%jiCOw)jHNcs%5 zaLG-FcdOrB%02(h=D(x5`~Ld16lh6Hx6WhBuX$kDH{_tfDqgh0Dkq}r-W*Lq7f?uN z<{-O^g=-^}rLOz*4qi~pO=N*@qUGz%V`z!w^De_Y;{GI_^&yVB=2cp-w1)XHlt2cf zR0N*ygz=N0p}lmMKeV|E^-ly2MqjVG`PJV-BVAjR5|=8u{(BcmkCGda1*vo!fb?s{ zLkCL_N#J0h3|cOA1bbtM>AnAdeHDc2^T_P81&P!AC5*i<;IxsyV%oY#`1|M7#-b2AzRs!P`E8OTsIQj)dX zv{#$tkjm=f%e0U;W@B1**&!-UT}n~+1_nA&Dw5NvfWE%8DxF;x#A@buMin-#rN@_L z1bL;gA!%6nhlzeBMyfZ~(D8us#vt+jND!L-WHT&ZgjL++9Zdv9|;bTMN26Py^_m*y(<>Q^OAsF_L+-2BbZU zqz;SSNrTXiBE}MnF&G5z`*>FmA)gr^?DdNY8Mz^i>9J|li0;kQu?L}psx;G&eCY2L zKOiZ1O=^0VMCb)seagk-B#*y=WcDXst|advN`>t1NzSEFA<`(AnPkU8FA z(E(ZL?%KN*EBo|0+7sz<&G9L+OYK|B0!Xd~exp6pH<_hvg-&Pe{xb13@Iux#QXQIA z)~6#k^rh4jB@#>xQr%9Kl^>b^&z_?UR^;p|+#;W6_=MV`;ocIPJ5MdXoC%U>P|Kp5 z4Zph>q06VV`{qhkq%a+3aWP1_Od*U?t%UrpxL9`F@`fGc=J#xs7^tUSTa}I)n*c?a z@Rz9^i+wPEp?pQu=_4>plouRVf>PcF9a>B26nUIQ2_xO2aK2q~M6>xQv?_G|baCNa zIQHZmCNEoqjnqT_N{0Y~%9ZVFP=XpLWEsX|Xj}uRC^pwT8e;sl5{rpCd5T}qdaKvu zI@=-s9+@cJJ-~jED5028ARMzFa3=VZ`2M_09@N7sB918oLIB}td=sO-(!3eoY#rA) zuQ3f#kbztn>Ki7YHm7b+6wl1Q$N;{_Uk65DL|`%a-2KfqJ`!MaBD~V*U=l&;fEZzJ zxlp&T@`HGmbX$e^RD{$8Hmcm|>3e3V?&|u`cuhTr`sf`jWH@6SiJw*0V$%CwFba*RdT3{;YylJh z7&Fke; zA^qOTpZLyBTG{it*CV7J9yn2@k;R?GoEj`=c7qpP>EJ448H_a7jpqBI^7%myjo^gX 
zM~skTfchVky)wM_hluC0*T;Uvf~ReR(>Xp)@JHcunJUs1^n;ZOG5dZOKr+m7i(!s; zZve~Ju~b^g!{f!Z8q^KZ^JDPM(DLV9V&C#_yxB77o^9ONhOn3O@?%YgEt1)%RPQ1~ zQ%K@bI3~Lby=bNILnsq{(w&BjB5Ktx8YnU)nt(>~`o8vUNb%zw}Lv1Xw3VvQrNZ>*}NN)etN2CLPTYuvCH!k%mLbHf>6=h^Ry-QU^v z!yG*XR#+Xl>+D5%C^7uG;D=WuqQpmdtJQrvum9ac$49|R(=-2$hSXB*47Q#nOfj<3{Gk;(3UJV$|bcJQKG`t)>#`K{|fwecCAx55 zL~LPSPV|`6YwX97O_}iJ9-)RW3L8eX4T^X52w@~93N`|d zMwvH?;FAZ%7SK)-%_+H$ksqnlur+6K4G6*(*Ye3h+ zIG(>c%g%5^MFVlUeLXGo=WlQaH7|*I57D0f|>*>@_j#hVtS{7mN;Ah_75!I1+Rhc3aW39%AA|{ z>+90xpc=GN9&qzY^A4uFrF!gqiZq`>oBvQOQ8B=qpU{0!c!Q1oZ?hD@3DD&2*6o@! zTg%P>qDa{8&_cdoB!t7LuReJ8m_!43* zcbY+}J8#Sc7Tx=f!ob)0rHQxI?QyC}C&UVrV=y7LX|t{I1ZNTBsHgmjLa9F zGg^m}a2~t-42+I-C=(OXd2b7>=hVe%7w)zgiPKZ$!UbulgyqU74q;N|ry4)V^X_tg z#3)z?lmo=3{^`**|k z!$wkmgDI7Aear{!dNMHq;SzU0rtxi*>B=QBW`oaaIRpsQn&L9>wx+ou)8k{3T@B%C zfO;-pHmW~%B~eIi_x1F8&L6*VKx*Z2w|>LOnqukZr?aTf>Gc;Xnh|_bh2>Yy0|yF6 zUcPkA6Gm~s2l<(*F0m^9n>UjKc2UlgG#3w&%xztYHp6{k!}TAMcV32GiLknGVx!j> zX^FD(mN@|7m{Z#gu+XUs`AVJaLGg^RZ1a&%WYf-EZKFW4DbFsL(v z25TPFH8@mD?M($o(HhWAh}#u{-4vYyhXz^zBv1N7J;yUqe4g5!C*~&;{TZZ5L^~=H zNt#If&{DEjh6S>=?=+V~U2pV~F3d(rGU#zyzT#tdU(+b+wjF8#S8gZUgVf-B-*P0> zz-ITR4M7)bsfpjIs$$}JK_BttUtWw4kxu0I7qG!5GdE-5lN#GnFLpc*F*B3$EB^cZ zL!-vQRsBrf$)Fn3U8nJ-f=qEnFMSg7V{*%vu!te=#aK<3=o$|LzV1P99-i%Lfi{fA zxsuG(9tF?hIF*cB@(OnadZo$gY?lSPUJe|E zXP-n10$u~x83n6S5fUL=T4EpH8Hovt0G2i>AxUW#zAX8PFvDNzV?>L6nwN^8zAk5FN3)ZWbD z^`@&^Mj6FP7*rc4)yoR~*$GiQ|CYyzi%r=can2Qhv(|T3yB-|@krU-Fri#?!$hK=h zrs3fLnna5^DJ0YZETm;siyW*Qb+An`P2FI-eWLp_1q}@DM=s!lv(R42p^P++Ud5y47n-sG?y@WglNeU1*U@ zoJf6jyD3ThxUB2PMU4m|%;&p@F&fZ* zDCIf&CgPSyDY)_MR?{Ni>v0_m{!F4lqVp703~f?C^CYZ!bcr^%j`T)9 zzkuZPmOXZIb8)x66y6;r4mF<&+@E}b@ne%skLC*aW7gtA#$a1?;rFg*EW8)kCAII? 
z@U~!E-JIWD+Te|P0Q+C_DA>%(X{bMWlD^qIHNqhB;Od1))de$GUJEwUpn28tOoDF@ zSDKPZ-M#&!5n&OuSU=bIZDQ=Ak>a^;>@{7{y+ zhpq%ebVrd1!I5PdVR-$VyDY_V@89yL8`Zrn`JkdX8U91RnGjl8y^d=`oT|D*N4`RH z@7u^R(^%@nF2XElJhv>9B^6o`^IcSk^+&I8nO6Z^g_OFGYz+Q3^dHN){QVK-C5EGh z=-gD8a;>?(?@e5PWjX1^?Cnj!QZd~j}mZ=U~mU_XmsG7gcEQQi~{bcC^F6^@oa zN0j=SO)FS6)Y+`nt?M<4^OKrH3UnQq7U`kJb**Z1sfx_qHKz_?R^@@S zsWqFIG;kBiM=sL~xYD{d;z1*Q_gI*8J`szt^iUgb^VvqU+Q|GbT2x92>-Y}_b04X; zE~f#(ntn==Tk%GR>d6_N3z!{pWx{DFc5G~qKz7^)SFvLfoDpgqE#KAtzD!x@m%{v^ zQDJG?zl!%RYz=|PwcV6Lv~M}+0p7lnRL&ZUpMY90iiT3~&O#>HySXl^wvk|0dM4@% zM#`N&jCLTq^KYLvZTbtd{y*Y;8Y$>AQ-IWRsB%Pgdn9p2lzT{x6I0HQJXiA1Iz3xG z!$ryw_)Wp8zh68w>kyCkP8yDe^U8{;xpclHr|xWJX?yq9z6JwW z%~s_t(dZb9LV3K{U?-5M`mqh?&*D!JS_(GJ{$)k-Y?8YOX^JKEGR$?7fH041em=4j z5+(i>96Gz3kw(Y7JUoJbcf~X6zGSRUyeeaMh2tE=GK>FC`AvwO)tarZf=SBF#QlDa-%_A>E$TpshX2@edU~RR3`|c|~!E>?k-km_s5mXREVmGXjfpK9^ zuRfu4qLUM(Ti;E~1PY8LAR(q{JaglZXxGfu9$^%dR&Ak`&k&`U|rmx{Yhl zNT7aRZ|M`BP?p2Hx0~Ydb_xO$HY(g;1HakM*zhVS=WqU=qq0>6H#CiIgorwL{J1cV z`HkuplVbPSX-`&JrrfZ>>{;Tu+kgK-K$hk*HDW_mhNC0F(YYUu9~MSn*q(tCTea0pN(nR(YWe^XeiRTcsTm){-bVX%jsizA|KKAdjj>) zUS%&!qoJG;98F)DXpF+TiZPESkIv_qN|LCj!}|z?#&Tn6(dsI^%Ak}QnXfCf3sCAD zX*%MN|F|s7y|q3yCJ-tpg3CJI=u$(~b|MdyvVk_qVw{?F`r+1dQ?u+ll{H~2R+##c zFOzEvCQSZt!|!OFi0*b{s4Cwk#rdzH8@QcvJ=aXNZWL+}H;^X{2nfU~XSf8Iqt(LF zG+({{hw$*wZX3#Y3;>sz9F z!5I$!xg*moF$WFoxQnv2>d!2;B-kQ~PXwn3FYB^ni68)5lIi^2l>@dn}zly9|@CB>vhAhBY)Z7(W*LD_gO!w39Z~YzY0)t4SWff18kW z1KXltT~IDJsBfU#4XL0s_ic-s!gA4jFh|_1Osp)3QpU$x={49%dyRb_V$GN@t07Q&V zO$(!GX``Q{iJ95;(r^!;+X*7)!Hsgcd^oX0vwYq?mFXbMiLeMrFoD8{>VgC*$;J=rHU ze#F1}ZnKRH)&zUN>cx~WDx-#@{B#%o(RG7t6BEtGKH?t9G6Lg2d^^TyGOc+o85C3g zPY3omv{yT|o$5S5i^1fN38PU=S+v}Ok-zs z=#brmbWfl;tj#fhM$$G!?*((=yqL`Ki@#zy2sv+Dl!RgErgbN#&}apdVdd4Nn5kWa zK96r%qU_Q9o?om_| zH2tKjw?9=cNHn=lcWpQPddq}-D`_Ai;r8`Lz%pEG)Q-ZBWuGzLw*>vi$z2uwlN z7SCF>8X0IffqEBl)NO6bLbo=`k%E& z52gDaGFCpC#t8@ex22?_fKU{{qKZAS7Z&GO>K7*;3d7_t6=S`--^zs^-SS0H;%+G4IN0IXjFL|(PW$C(? 
z8CUxC)-r_ta@RLYV3~ zF)zu>JM{x5RUlFo6mXi5bF@P>C@5XjCR(am$$Y+o)h4?5>=#&!-rs4g^2h*p=s`;w zCROkj`+`D?SUFWYF%ByvcC}m4|M3XxR&;7SdH~KR1JIl5O z$5x-RVSw5EW`L*d$Ql$I=>=%~M-M!ONuz`T$^z(P5TTwA8hsLzIIm7Gf%l~M4wUWH z#j`>!Ai`>Kp(+4902M@s1pfGWv{zB|z6~aFTyvK`{{!a&gzBt%TPeQ}fk>K0g?NKP z1Vh=jt1LA#uJZ4?(Pw@|(zY9pY{Sh_vHhy1e#uRFZHkzPKbf;3VW16wmzMb2LGirk znEhpjb(*(%5Md(-3`Anc(IQK_b;e>SPUQ{Q_%6YE$3zk(wps^VgXgUQOMxYDYeU2# z9k$?<2%>`O0zz^`-HOwXfj}ErY0Bg#ir2de`|>u8rDg&1m8CT*R<3Tdd=$ukF2O1M z(GRDGs=Qd@D<+Bq0J?=RYp5Y+yc?A*W1xDc9#`jkhjlz*q+yeDo3+O}M^E7xh-x@z z?>Diz_ZHbRzALwIaTHKOwAHtPTQKp}ngEJ1ZI&L=UkE3?z`RFA;$IIHqfJ7IP`I3E1;jxE(_V!s2*r2H$_hXsEQ-3jW4K&`*fY~<@UtGG4l zhW-&4=0z%?wF*%DLiw*Cm`hGyYjdIPLI5?cmX-252cq{7_PTiA^=MGYnHSCBv}%JMDNL4We>TEBcDqk9F#`I!iNY{Y1`*Tt zQZ?{L(hwcoCgm0Utxq>tIR*HM-JbNrgk_P)cb9$&C2r$gkj%B()03OKme24zO>p#m z5kCC|Hqv-_bov&GB?qMasU3?o6z)+q$t^V<5{9t>I*)9jiK;_ejr4ZoKSy0A;ifA? z8*dOX02@+8gQa!zn{RS+;ZJYAWlPyM_i?;iHXE|czo!pU2L+*2FP)a}lJ$-UzEZ&> z&=F{IpP$a}3Y448FtB}7*sjAVp`_Lez+n$rHU z8NWyvnrRP54+9MRvee=M*YD9yd45^_y$|@qi?V$fD-q=|nuD1N(g;P$HY2-~P^)M` z%p^59Xxb@})FvMOWP%IML5glZwRUFNUm^d8tDy{MOFmRuzqvC8&jWimH$^cpr1XVe z2U;fy$&S(YQ=W4+49M4<(Y=g>8rR=hBf+?$Bz9Vp6}w^HGJbe;H6^qc|jGSlWce5u=gKLc~# z!fQ2DmDhLr`g^vw@6R)b{7BxVwK-;p4sB@yupK^v{n!KyRydM!44mHm6@*WG7>@^{?`%o6ydriArF&3&fhh0!aLh^ zEb>G;pPPW^kwiZ)f~yf6^FG0&^J&qs8wxmm$6SL!+0od~~)tE;Jv2`Rp!GICq!zW{uIvM=m{a?IRE8Kyb z-As#A+lGEd_FAltsjJ`Hjx$}6uQ@IVgY6YY2i|re1oGxKF*+d-(NC(>G!csU&uEP~ zVMO~#xrcL$>EMBWZN?oMMBfY5Ny=%hSBt1f>n;)-;}1w$VbtP|r)6}M`wcCh{0FrG znSJ(cP+YOdeB25PCK!W|zY2=st{-u`_p6zKb(Yb1G%X8RoU;S6l5l4D(je<)^&b*c zYTHcAU$QIl=ibsFE1GNj!}3m zZ^QIxJteBV#BvHrYV&b*nAH@l^kB`3r+xuefNSIAFX*6K&c?%m?ykobZ1V96I-V~H z1ioQkfHPXwxy`;sUF`y2&Y}w(neG25dwFZ=(&Dxc;UK;U8Dh7}UvT*AOtfNus3Nr! 
z9HE+(ckr^KP?%Ug2r7XTJh24`d3Bp-oL;{8Z|j>9Ac|y0Jjm@xm;h2LyJFjSmvBgr zq=BKe;`5D`$YSC&&wM39$G*zAiSNJhQ01s0%18V4^QSc z-IXN?qR@KqE^8`^Zg`OY!CU2Aop0OZ>Hd}vt6ceUL<3p5PPFDocNDVRQVHc>W}OzK z&rb(St6opC->7FJRZsMzdA^RkgKF=O%t5jFu6>vCA8J{Tze3}Xe*HdvnZ|wQ1RiBg zmCF~K_35n4mAv0Zbn6?A%51oUQUUH99%BIb98LTf2z%U9qG@d#Z%iTaY_HT$9&(HnvBPbI*rZDlcl zwZ&yb75KkEKr!%lUv9+m$5t3`S`G1*xLLWMSUpCbso)dQZa?c}@CBtz==#88@LDv^ zyXX_LauM0ypw$sU|D>P!`f;BuM&C>|Y%sQ$I7ZQvfEc>up@lJ!oD`Q_JJw+EsqmV4 z$j0Zk-7j0Rm_M6}W*JfU%U<&+s1IXVT!)n;i-X*qku%NVe!BPtnW?!kLuH%k2%Mfv zCOhG;s{9t$E(!3VN!71oNLm!}^Q79CQ~v$VdPZJ9rZ$0@+um-OIehAxV8iRxe~D@} z^2>Z1dtL@+%#APRobnz-A8GZ$wQN!S=?jSEu#IQj5%e{pW zPdek|+RS$?Q~itoP)Q0TKfOx_rKbwm@~pOAo4@lr$KC0rpOVxsrXMl1V|teP*3JT} zdb5me27WX(2=$=ndb6tQPVbCC73u0sR5G3b?E5lXV{U@0DPVwsIG^f$tC z5d3!xf%Et=_T5Kqa1sLYQ(u~>;A7Yf`ONaC!pDUx##V0D%&dNSMKIf6lW+m$2>Zt- zYST2(^A~uGcn%mTA1kagm4~EV@S=|;e|XY-(ClA$Egk_Cy}U!l!H+W55t|E}8lK(c z6o^R#Edo&tAAJHnS3G}RtwR4P5e`(xVdFIaLccdQO=K1omkH}`Cf?!i59qX$Mn4`*0UwukzqeBI>G zDBj;!57eUwYkn~KL)th`ko|-BqfZXP>PjyM+#rb}D|nD%C(JMR&b5GC$1My$H6yel z^dxx^I5lC)pr^ZkvJQtM`c?TpJ>wZ$|jh{ai_>1NA_vXAdZ=S zw4VNdpGLwc*o3EedTxTrW`($oyQ8=VPWMI*k*};vCnVEuUmp<#B{zJMa7RuzhoHjHuwnicjv!a>%)1DtgdS`M?`V?v%>2|RCiOq;m5>^ zXO?Ml9HGjgD23<#g&@+#ehmFO^!=dx;5Aw?qEN`MPWC>j#rK0Pnu$$Dv(etlp-iUx z;%PGaDqpe;9!!F?nzJ#hIOWfqd9MOcE`+@Fv1u%5CVZ(tjY{dRN=ObmYM4dca1$z% z9R51WgU|#>u|H}XPhJYz^7rRrF5|Y*Jek~*#4$b$=pf_&v{^#=<)8SdyfL#B-71rD zC78m*E~7&cLyGmfc^j$>b);}9p(0ulZ<}Dw$eTas(Mg8U36cFV>?9Ev-+|hf{?yPl z-IHQiWw9+wSKT4EDGf)XEavZ;pmxcdmq+Ou0zWDJ%HvP_mO)VY79jwdkoiY59>(T1 zG&7nu29;%egMKSfdCG&(1NdDb;`}B4_%IlJ`0-Q(<7sA^+E3Zc@ zWs?P)Mxkug7c(Nz-0_&CXSe-Y%t-(qOlFx#?1ZjuX797GqQY~*VZCMib}b^Ivf&Mn8-9oC|@Kib$%OsV&(=1b^rM7o*AM+yL+LT@`v)`v~f!))jZY(eufN3jE zPUO8PMFipVdus0^C}17y4DEkth*Edgj3RSi-Z8|@r-w)e$N+KHHZ5%-oG0O7Ee(MdF?3(O&bd z**w_4thow|)2&Ux4|=V2&L4_@&8ziQ^a{!Bo#v6s-Phj<(cHpn1s4#i!1yk(gYefs z0e(gT&6}1b-q%_*y}IcqkU=|ZofrkKNQRRAmRz}>xVHvTj3J+3{ZxR!*&mWO@p-Ed z;(UxVl+qyd%QkbZtw%=j?~_MRCnXL$_!fsGg7mpBCawha0m48L 
zfcWKcq`h1Vp!fR#`v_7PQxegFb{82+Hc;~=p$W7Ju!!6iJ(;omP%p_?iGq+Q^oJsI z^Z+^_lt-5C`96E3xE69QQ~O3sh3x(sJqW+CRjJUIYUSUfIp_cq3X|OuYek7*neaM$ z5YH3$i~tds?9L`jI0U$+gDU?>aVPh_X#t?X6nDR_>nlzf*PPtcx{ujqeLKU-H+56P!I=I^JxeWi@6YEH$!x%D58G z|3oGJk#j zWdn-Fs{`qE@MVc#vXBE0)n&Mh&&WqQhe5w$Pqm#2956+ZT>0Ul;?JNU-Gep6)GRGQ zP*hjgx@YRIYuXDslkCLT+)eRgNT(bGcQ67+d{QN-fO?EZ-0{_O3j~Ssjx_8bq)T54 zaW6UQoQ$PiGtgZILdYd+g>BTlixDb;)$q+jgnac?$^yjgt@9Tm6IK8M6r{M!>A=y4 zH91>$V!Y1?L%T?LLgp_`m@OWx{yX}!n)VrVw1?u1c=tyP6lw+uK6rd+xIzIxbSUvL zaJ>c*>)ui7tW+_Qzxc%W1`I|mj2(PY(%i0>D=7_}jc*_172zoM1x9v66@JkiH1jgo zXdHF@j0{YIu}O0k6ZmS@r`!Rzp$d0M7(7%vbxpMSfqac9oE{H!6c$dn2gJ1=Z|Gl;;Xq#cp zl@ktuFjKo+`MonJr)kBP!HBR$7dq9$Pm;vM;nPUXMbQ}>Iq0rh+}Nuk;lrn&LbQ-P zO^2j5Ie)Q$U9ef`)B}m*W8M)&;-T!|PiOwg?`ngRwE(|7RA_4=)dm2705nw;jabpJ z6MklqabC=BCqCMc-h2apdXi)Rers4?-x+M4DE?PV9x24w?V)wUj%E5xUi78%N1>SGYcc_3j22^%H|q!F7cqT<(<(6&a-P1pp3& zsBx#gC-0tl-=&B-T)cDG9^%onU8i4;g^Iu<0Rs8(R^8TFDA*Q8Ef`B;jUvU6ctKNJ zfl3E8YSl7K5W2k2s|zUFD{x+3kBIBG<#*l0UaVo0$}7~u4uyb$dIjHTId7cE<`a0r zCyOxR`T~;&b`47}i)yeMEiD7a-Uq``b}EtJ3&^=Kt(>lxOnRX|5F)xIhg#FTGHp1~ zeS3b0?>(BM&XV=ln~}yhzwN0zn=K#~BqkSN%pgM4!DC%$@Vp=HNEdsRP20zx9l=%MtNZHDI=lqv&?!cM6-KGgamVz(7a3}#>>-ihIvxRc|tXYVN=SIf7*VQ%eh5l>{M;GC% zUeJ$nn*>n6$@qUg8l>N_1$Ap$}^L;(9XZ0NdN^{ z>>yd=i0-J}sb(5^#hfvBzo>vq#=e6AdQ^xXzVRl%j#Z3>IoIx{^5ZLyQn)VN zqtx+}_*duR^@fZ9oDm;0t=mz+5ZUNogm9DkM;am_0Lf{7{I85w1<)g}w{gu`5h>`i ztuDyhW+n}khBA9E!{`UXlny$bpvb%^A$U3P#13@gT*uBrb$ z3BWvTUO;FQ`g+XNQ3(l46+w2v^!iVTOTQLacW*6;OWJ ze+LRdPhN1OI6YMkxFf>e*ZVeRrWP=ax==u(AqZp~4usQJqbrbP+&gi?f65=ip(c3NU&M(cv3iiP@HR}>pJlXmnR>x<2r<(J&UW!(}Zk3_Qq z@CvgH5u#(@$D+9;T^@mfaDlBdSh%JEH^FDEPhz*0Sa_dEnz9;AT?=ipMUv(bX&?lW=v7%LcQ@BKG3BBoTzkO5if)DmVgvzXj22-d z+GOz3&qMLoxBcsS8ND3W)t1-$CdDEpL5Y|};%99JKhxo=OjuY&Fd6SW_GS`zSEZ!afE7QK2KP3#My-i zMq?7L7(8)}|L2P=gf#nu_=Bm_flhK3Ca_1x&dfo|YxI9u_hkq7p)+%ex92)qhO6m= zJl1;6yR?NjM&$VU80}(x=DG-F@7{TyuU+WhK6&IHa83_{2pKy#lP?$ z4iL8eNwx;o}JV?JTDsuAL#_c+jI8 z5B)@(+m!%Q-1K#}jw1BB)QOt49*BLU*Ui^1BHAT6>aT%B3)SIZJ8x0UE@%@v+oZ#j 
zR({M={|oh%t0p?i{+qkX^kb&RqSKW)I{_{h>Ec4ndkq7M zGu4v5`zm@f>7bPvyi1~&L_p5PglzN_fa?d#de`;d zBmQQ_F!D1CAQ=3ytM{eQ0+}izXuj}Voa%bZ|jLr$7g;&9P2q8K& z$sp~j@oAc_Hrv5m$Uf;tEb3NlNk^Cz)ilh`ai@sWyj zM!n?Yp-Z?(=kL88(0d@~UU{6RsEx?{e&Hx6@=xVjMt;|VZbyx5HFzmd-ftt)Ax2`( zTT0?}aI)Kx-sO5PdzzMF0%mnU4^8*>p0m#Ml-SaDpGUi!wYWK*U`#kq+@+x7_Ti&_)Ek9+H->+SW<$tX?Y#d|b z+VA4_SMu67X=8_lF}~AS3Hh-FxobVrz}-SqWX+S0IL^P(QB+|R))~KLVNWs5>3uIz_wW#{^JV6QBp1K5dXPg@jb8LJ zWO3)m6=yBu{;xqdQU4LgW+Y^-d6uZn^`MyWIIWwp6TEb7t5Mz7I+pg;rCQWe?OT*I zs*;tZ)d{F%LTOq*Rb_fbMtk9EPZw?4vDEj|WQ`Nu7iDs4vY?yP(X zqO@`xR||}+BK8%Mq+_BhLR4iO5Q-}i{-0?o@FR$P%7dkoAWDd}FyG}_M3$(X=|lVm zCIThAIuZ5f3_G9t)vUGQ#b&xE`F|Xpby$<%!-qEpOuD;Ux=T8w>q~dHlpq}&(xP;? zqzKXyiZBW3Mg&2+QMwu1`~2Sjx9d6ExvsObb3XUyu7VYPYY7yVP%gR_+jUw2+6@L%7?AUzg4m$ihMi zN*X+r4yirEYC@Ca734kGy(uNnz`*Vcgp_?*Jl4Q`9w2>6V5H*QZkO9FH*h2L(!1$l z;#S-TWd8?)h4&9F${bv&PUos6@AVOvV;A4KRK)I`7k@RyLRrQDs*ZIdlg8%XXsdY` zexehTq#`>H#fH}l(zCy;Y57ju%DVXO`-Qgu@TajG=fA8NZ?z@-l$esm9BX?n4F1E} zk=I)Mz}=Y^V$Yu+xCJZTaC@5KrkGNIK_^rIPNjd`RUD8vEf95dd08?1O@#0kz4!y;suT<*(c*1 z2~!comUZg4wP#ZGSk-DIZ(QbW@GApp{a(NlBlxuQ&4ii7=IioR+yZ4ZXhd#07+y)$ zc$|_v5I*0w@}+S5ye%HrZLY{!*1r55nrSDxm9O556U+$|1)NI~cUTr$VmxvD*zl3b z#Xe(%YAI$FL&shGB0T`Y!e3lqA{gHWRuM;hoTWJxP78>v_+D9AoEpf9A+atdu65(Kv?~>AM?8hi-4zeDtdh==#8* zig;&Dk^8NECKViF=al)dAYl>Zv)o{bQM~Srn9SJRJ z{+8HE8YOz1WwMFiO*V`ViPAmw@&D*(! 
z)l!E_CHxg`O91$ns=XNbAC0bt=-{{%pLld;hTPFi7&(Jy7oy5XC&~Lz(Yfk|M)cpQ z$?i#c^)+y-T6bdS&n#lsFs?0l*hc~r7yf>w6de;k1#AB4IVfVzoN#gc5Gp8Ccw)@QCbeLPKAau%%&R#uvWLVvo;?#pb zd-=mBPATDyO8$P5hjJA;buvwpj$d;bAs2pJIh>hG6wHlRnQ&T?EVfI*`yWrAIc2@z z6RQXpEjnZWQb=j2C)dSW#!rDE^V=(~JSZPdO11Ph4Z<7q1ic8@Cr*3JJHwlAaW!~$ z|LI=7J!ka&^p_5g3^%Ll*?4L*ZCdqKn~+g=#QImP_g+x~ zjn){Q`;eF@1#%jKCyq>TvXs5vCaX7IXI)LBJyjB;*^~8xGqxq8g4r028gKLO+9CZ@ zs;mqZ z1@++(i!H0p3WT8DF=!El7e3W499ZqXl87<5?8X?sME%R!u8=Rd{*a6!YIuR31-C{W zSJl4kMXv1jkm7C88x4x~l8lUzhm2@GTQ<1$C3jT=cNn2v$rDj%6em;5@`o~IM^`7s zyPul|$43U@N)neb_aeFW$~vVk$%Z)Y7@>>&OBc3$!X6xEy1R7Pgtrk>w`&xotR!FG zp9Kj74!~6c?;b)TMi)2UbVITw(Pwx}GI z;=I6<3g+XJCN2Ovk1eS)V0zp4YG|xt%L-ATMGUJk)@QtHrqm$sv$wWUAVH5p<}3B( zcNHRc;yAzf3a~f2eB}i2VPD$Rh=YaC`o+8lg-=`zMC@zp?RfWzC-es#PCzl{I2!IC z9W!y`i5)zFtZ)Vzfvm)Q)61g(>`s?OyyAa`&?zawxd>0P-w3MpSUs2s@x9TAz8M(N zeOcmlo0dU&GhOtL6uVYMdR?PxL$E{K~B-GA8Z-^VkuTRzqedw}$!C4g*z_d@i5`2U8QD(Af4NfZly2kF(O8fCqV5@< zd|jX0qY9sI<;Xg_CM%I+2EW5=&jG{MM)V!v%f=zYC~kHB&r7?qJq;tNn?w;z3rLCN zes9Y8IB8;rXHDF3#$!V7F}G=eMp%3mh@W0mj!uUJqTtEp{%Mf*ZN<+QTWRz|zG|uhoroc~!zHLJ%=Y6o zW5Ud51`nUkc@jS*7z?FvTZ^-@VBM61J3q9nnxIyw+xS32V?Tg$w3dKO6vU8NLv_24 z)BI7+XTy)$QeVi=)j6q8-_N<*>2Ut6cjBNdNuwyLtt`hcZL0PR{ks42q>B&5KxB&n z=Y??Fi_21fM8(4UiKCQ=n_4cUKG`bOs~<7DUpAsuMUzg^!~0XA4y#KXzg}UEvlMQ@ zu<8Aok573C=Bg_h%TZrGgTF@pBfXdfC-9blZF*t4TjoZws!_0E7~R|X4Q8%?Z3b%9 zs$(ka20MFMqH2YF&kNf7B)@SdDPT@U4-ioPu0ok_MgD;IaQ-2tvFoYpXJ0OQ_cO>% z#bc!4wY8~|uDg()7iGpAw4-V+qIV(oQxnxN--$|%f8^*?tqw*LN0#9_4VNb4f!Hs? 
zgyq0Izm^qEJ}Y9O3%7o9L41W&pv9i}$zsMQj~Zhy=1yEL-x#4Szj7n|)YB|%2lrD# z`7dS|WkwfIIHaqD?9c7-Wm*XK4dH}I+mD7(V;!z?DXntBC;QXzd@Nn00OJ^ww2K6S z2XmNQgk)5Ex-|JAtp3u@4>FnmqvI?!2lQ*YUK{)j+EU~m)+hC&2oy6};H8xsyj;8} z#H|to-r%GJFDyDJwn-ae9piEqC-Z)o@LE7_k-l<6l-Nwn^APNj#xE)J!G7)jc{{|T zFtmbfU=M)yAT1(qUvIU2RqH)#73`&bbI{D}j-cbY@uGD}#$6)m#ZvCt;rpsO28o2J z-&*iUx@_WJAB2=nFCXNArMBSuMl7OX)%}!%h2R%5tXADeO@qv)TQ1@i)v^zmX&7+A zgP7P&Ide7rEsa9g@b}2I_Zs*-4W)y|Gz7yWe(SfTMtRo*3E=*=2KQJOTS5;Z+($WP z^muA~$V6KIcBD^qjY3Vg&&3L~R40Q1;u9ObCO_>j-W|ZBUo-pMZNZ4Gb&dy}`qVIT z&D~sWW_>`K@8KE#EJ4BwI7m|g*)HBHzkXv&EOJ5IE#RFSeD#w(?)^V@@LG@tNJ|S8 zqJ#fAw~vY~q2~cX+2jTiThIRdFaXkxjG<;Uuu6&Mw;v+!oWL8hJolfl$Ri^UfE0o0 zs2@3v_<0FTKw?gRH}@5U?#cdxw{O#nxlxG7ko(gfl)N(v!tGYksV|V%-mdJXe?=!H zaE6BKJvTjbGCF%)?xw#)fZP@DgJZD{QTae4@7ZxS=v689v_G~<7v4xM0AP%w83;ej z>x~bbgij#t1Z4M}c>D^L;6G8UNl79iuvS$x_#~Mzan?e7VbskgOuGd6Dr93BC(bM! zgU@EU>LipocZ~23ri0bqsi7@NjQ{VyAY9Znxw1A`6NA zx-{jEVf3VDw6_9Q2tXTWnP#lJ!AaFdcJ z;C}|p=hIL^_z6PNUSE)0nuD)?RB?7#l5MxA9%84BZ4Whv9-n!s>%Mj85m~JI;B&-h z>o&I#9XQE76Yz3ZUZ%fG`#w03@e!v9CjxrxU#_cbt+}j{`&)Y3hXT|_5V6O8!>vF( z8B5}APqYY<$VO8ksO2P!gzt;A$ns;LY0AW^q;Kj&2cg}@6#9SBRTLr;ru z1QsmZn=R+yFokX3W9mmYAoGr8&a&)u3>(3=eNuym>;hc_7fetiIPG^U+eJqJyMZ>j zVe>&!;DIR`MLA>-#N^O9Dw*#5Z##-X)YtFM6z?nFI?p9GLFq97oklbfqJcK|Eg6ri zbnd|Uf^hE3?{!OS7{2h6j|gskv#C*yE<^Gyz{m!0C2I(-Ii)=j>NBSdp=fpm`kP)5 z4CD(Z;D3oFQe4it_e^U=2D=9X27+O%nZON8Qi~kejGjh?Je^Q^0OHAl>zNa5Qz7zx z87i)xNJF`cdFVAf6_Al|t(Og;u~W%&erPE^`&?8Uj@QlqhbP}}6G`4BB~XmFD=)o< zCOc(XkC`dWMB~vP{7l<6LwuGhdc^Do_kbQa?h9Mu0!P*`R}^yZZV8b%NN@!INGgFb zHPuRLCFADGnl3=f*sa%b`>EZPZWvSu_g1w`ZBMbBGK4_7`A3Lukz#aR<1R;}F*cPb zh7h8Nr+;`3n13%QOmvC;W)lz(&EYTd>J^`hvhIET}75n2$hf-lDZQytmpX#7fdT$BgF z#Kx2xfUz(%RUv_vOR|Br0+=m_Q?>iM6SM>=tXhgln0!PMvZ*s6^K)&uhdJv#C&7#i z@30i_9o4DXmF-n*jai*r0ff?4@BcxUc)EQYd&I@dEtJJz)^RSLtu9>hd3XFXd$-va zv%$NDjg!|h$B$++W@~nt!bCxSvYG2k?}wNGZ-ye)^rN8;YOv=aFZto|9kyQ`CWwXW zl};16^Ic4ARqp?s8&sziuLMI`Qyx0s#)%DW#iqAH)~7hIM)Q!Z-$Qg~v;TWl^l(N2 
z;4F~(-2K$R0V@9X5&Z-t4BCygpnc=WWsUBHW!hF5Td_N(MCTCb=8P*A8|zO2w^B{ZUyk2q#lG?$a~r4*fEB1{UHNY zy$y>YQqOK~+eHATnz_pkArNHY*71x%U~zlbt45`ls0uGyDG>UNlOf3^)J>%{81dx` z(ec)}h2l;BN~3@gY#G@46VQa%lEPKXGdB=c<3l@Y5QoB3hIVLKmaPl1dG|>ux;`?$-;|ePPAn7&0B_`I^;>fBf;!i z?(kAv!ajht<_ho0!gp8Z8ZexSyjJ_NM+9e5oqDk^*{G17vB3x| z@L&|iDo8VN9CU|MKhMwyO5p+EuO&G_a93x za~0wpa-RKd>0h{Ousc9S3Un!G^}aSu=AV6#M+5p1f@T($o%bprY5R&_iFD?asivWC1CvacHnZ`X?@4+1H zana2kV1fU0_&}D9Hr89tM@3a3BY5LdwR+oaT*#FI?Bzw!oiFa`c))8<&)6?_IS61 z_294a##wiz7(+U0zo;Wh($I8@dm^Y(@R9Q^|89Va>)M}PZ|fgu&^~Bck)RXQ?&9pA zOxe(ke5MRMQvh(N0Tj`IqC7@ey8Jx!;HD!y6rvkTNdd?4!xpa`(xHqLAGy#H(j70>aX~>D>}hoKm@VrjQmpC3ba^ zB|q@2sqUUGtEc+Max~=t=+*wvaq8u_OS1E*ndhz|DhD=ej0az{a)c+f(K(@G!*Q1u zjMo0EuD7)pLDkZl)uqY2KYMaez|FpygwF6rGvr#c+Dw)SWQ36Ibu9WuKglk2A8saV z2hD+Vj8qa*4>C2~-0|{*p+2ZQr?sn26ifYU?GE*0+A6 zS(i_tiaj%0!cJnKpwr^VfB!RaDAkXI^~FuK&t;NW)SGu!uq=iC49^`+m1ezbiWd_g z`~6k+c1mn08%d-A8-aJop;99h4t$9p=|p(n+pOUDGoXh)CT7xE;a&2w8|Y(9!`xcyszK-W_WtmBjl=9nd0zAfWHy>Qt)TYgiN$x{HG?P& ziPySkLYMHaHj;Cr6wqIVIKwCU1BXyEE1}5Mf;+>zH!n(l33_VCOuSBpy?^z9V!&!{64UYd^0^B8`~tBBUU z>u{Yb#SXb0D$1mL_VgNOhqAiYiTOoeFS>VoZx6pMN+%Wy(}!KbUpAiO{gP!OI^#v~ z2Fbqi$t%Y}k`q*}ojYobxcv8~()~x3L-&yD&@q8ZrEvUkPPwP(fR>L28dHT``>cq{ z$f6PN?1%Y!6S>h*0eM2ibG|>5;?h-pG#Zjm{(FZe@Q{$XF%-+4v!l8{(h>fg@A9Q2 z>IkTwPH*Z@DNQb%Yn0nt5MKCd9L2-^+G2sa-fVmGBe9jCPV-+_vGl5GGm0*DG0^9` z)Po%VU@+VL5c}YaOXE;BcaPSapaQ7gU--3%MDeFLaX zZa6R1{kf|`KiHh3tItKB14fS;BmC#zNMT+VKb+B1A7H<>^!cFJw6EEHQCDkb`O6h7O>x2YUc?VbrF#-@gy{tK|pUHA?GkN!epJ%wNx4+82Tyk&1`wNa4 z0~}0d7TX`2HKOqUHV+5OJ$AsKa~s|)@r67oVxAHRU);CbX+EhwMZ*I}Q52WIEH(lH zcWJ-!=`B4C3=)4NX#%Zan?_3m!8e_}MeLNuY6VPJIEMty{7&aQw>qwkwGs@U4+E$X zW_KJ~1Xi=rhLBYX95V~{7fqV$d5|7G-#(aGrLBFYG7sD9hWEW-J_s+Lzl3wfoctc$ z&M%;0*2Y$N=Oi|;k8OM8=cPUfk31U3m3z`+W9^$|ly!P2Mc!VzRnu_S#2Ln~xn2Z) zm-N{uN7a<`wY2%YUMn;`xO0qkkI_%c682x&{~}ifP+PsO+p)V;j7OV<@1zep{*+G{ zq8M$H70CEz1zzIGuwp>XE?{r(Wdyl6WenfNVHCtg%tfCPiwOphI6=@jUBmtm}A&D-5<-gXs zefKWjIme;|E7*(xKe3FUp4of*>7Jx1z(VzSABHK5dF;Qh#tAj)0lh$MxCjRGKO=&3 
z(SIQo#gxOASgH;+sTkq|WCpmh8!HR}3O0q742v{NZpW2w6d_M!LC2t0!%Bas?hp7C zFMw_206c-{I=E?^*LX#UTM}z?=qI4PB7#E^{ItwtGhg)vIb|NzBCL~#jn<`4iSss4 zr7b3FXD2{ln#~@k6M#~VTeR8P;NHy>*Xuyys|eXEXyX|eH7IaO;f_&QkiF8y{Fg-Y)1tA>|EmqSe1>uM0fkMy{5&0cM-~BQ|2o*wWhp41l!-V zs5Npn;R1-t97yIqRl0QK41FuI@hLs@=0H&|3A~!l?Z9^0zu&9-hFJJ`EfGknupoD0B_2O(TmXwA$sZ^sh4DtrZH_ zmDF8=HmZDqkjB!EK>K8)rG)@jLB@&Q(hyA$Bq`WCrFDMK=c?1}-TsLL;-5n?&OTnrGIdkQ z`pn66zvdL69NCbsX}Ez~Fgwe+U_MSpC_rq%`p!51*3WR+MhRQbUG7hSq1wUC5OOK} z;`l{XD$5&4%#*G9n*hrxmw4aE-={P^-2})FxTn;9h1dh&McF@V0&`&Tl2B@0H2?n- zFr?YUg0)!#5T%d)WkA*Z^s)g!}vUmAlM!>wX23L<*{fCq#(sgNLJYu`^J%SO`4|{(m z6f=$XD6x`hbW~KmynD`Crg6+|!4B%ko4Jyoyzj<{f*gxRw8qS3`g@f_BR)O*{sRpI z5k*U1OMPkFPm)~Ols=ugy#RQk6}QX?J?N`w(w|%VB2yq;Vwg^7wFijOnXp79>m>K! zL>mY(%fr5E?go3L3M~5m?s*GvvkocTx$^oYV$ZS0YgV|5dP{&#=>GKn$$(Cn-Ap7k zFYZhtmLlo(%JKe~3a5UFNQg~VsKs1SuGnD2-`bT7AVEL<=m**-{4!sa?4D8Ms&A3T z?d-R5^UW+&i*xIlsQv2(B#zNNRUr z#9mcu9hs6cYe#i-6T37NztUYK9N}@+AWj4VPPK?bfO;Cyxa{UKv#tD?xv_ni;?bjO zQDktGe5FXxc2Kr$HY#_#bmD>Pr*2c((&m*8aE@tq5Y=0bA$`Nmj8_w~TRHgMoAD73 z!N#+$5-(`q%lIJ`;+pk)6vl;~@_4jC&D}ldve@o1g&Y&kSnuU*gdKc|cL+ zno*!Xzl2;-1a}`_KeF`PH5GWCoyQ}sztaM|_`>NGXfk?Y3_x+5%vwu8q&>)3xf2)U z%X9Rbg9I1!y@sDU-|MUMCxPX_ym8}&WVR_~dOa|);Z*Dl?c2S-YE?scsBQ6g@eggG zi-ysn&+HfCq;#Pn%)eSGkiHll`*OYZ=d(%0tsw-c$}uf>izrioCGlNBFFD*vGj5uW zlNN*blb(GScTHuSp7XmXVZDxVQH10Hd?Zomc858@ClUjzfBEGvYWdhAxCs|%Ieob= z$?y9d+nZyl@DCXIYx$r+MvIjlEbe}NNaypTD@C3r;X+ms|1~YmnvV#9+i=ab$c!?o zQ+;eqAX&_`xZ8cw2hs&~z=u>DY4T}knyVU>AT7Em3Nr0Ogb08`s@*v0m{SiE5Hh3) zNE_1~W(ZDEhw9yTV5=h6qOB(aY|GcW(AN>~ddw8@3o0=Fo&_ z>RfbFpmWnUmu>k*H;V6WJJz?P&@Dl=o^&?%=uGVtT|2C=bL7^y^*3p5Ch+UavDGGJ z=sJn|aZQmp}^aG4;5 zp$HQgW&XF083&fSv&d%~HE+I)0)hnLOochwr=AiY)I`J79_37s`Odef#8s=&Ujx)npZFQ1l-N3c$y658`(n*xN-g(go3Lt#hZ5-o5$~iTI-A zQTihJgnFOp$6Ak{fFjikal247 zA`Hvttn}YmPv-&LS-7`cr&;?>nCkx3ENL=Ua-d@1M+}V-Rdr&IEwsgiR|Ac0cB%o9 z_%Q*J4yqRk1o@%vR@7QrXBty)fh`f>cOr@>+PZx7jxc-rUZ+&o1?c|9_#bG~2w#51 zyuL?ToS`Aitx^E`b3KeO%7p9equ#WX%LmFjVlYN 
zq)-G%+?Dr8m19o~7w)tXG8NRXXedGfz6KBHBcI>lWrbnDa+3w|hrQRuGxh6Kd*5Q0 zagyzmb@_izL(>3;s9v~T$Ug&B58YZ=kO+r^(OCKnniv>ALx}3#XE%&lciKPM%^dyc zc8qRp6vu`NGmJ0mi960zh_>7N-l=F9sBz~vvy;p~=Bu!uJKJ#s2PZ1)W}_$NA$HZT zUEg~7$8afhM6cZ^fJK)1HwgY847d zQ^-QbH6{p2c1SU?{(8#I@CrigDQ+nM^@`zX`0Xa`#2UL0(TYRiaJlv0IGT0BmlRpe{|=t?2WndL|bMn>T^Sq7nymID8TM<{N@XK zSE=B-i}Q2BtF|9nmup)Q+2-WK?TDfe;iT>%mupnDIy)tKjL^QXiboDfsF(IMA&Lzt z4=OMxa4U|*LB?q6)*+-_K7qAwcj)=w6T2Nf{X8VVsdt66_C%k6I+=IdFWtWb;P}cG zO8;)3ViEKU{>+)woTX+D zfAI)`;g^uFT_{AVl_S3%N7_*};XaO9#`mM^9MeMQb+-x}&f6C*k*&9n0ILof8WCkJ z+W;5zZBaoIhR?^BVmCcmsXCFJ^q@=66EJ|SXq^xr#9zuZ$TNQwL%xvU&lZ_vA7!Jo zz(Yn1kGuQ{re%%Ne#aYW{c{a#U0?^MLruv{f2_nVDrLI3FS7#A`{PtsFQ&FB)zx7x z?pd^AX#%#~4H*(Xv)q7^_z=t?aK9QSV92VO+c~UR^87O1#o*-ZPo6avLoPgb)nh%o zVQx+ia4fJ(r_QSvQV+uXUf*`nvzn>o4d-JdIM z#wbqW{aN|>41gZId7Js<(|TiSWPNh+r>VnF%&5C_b+|D5xsrkw*e$1f2UNW|f60D) zFobeOb103yMtz;_EJ^tyO)Z9Sx#xNcYwe2&D5{*ICpqY?I=7>`D|CJ8#T(X#6cu{&Kza_HKmi_E%4Q+xLzSUMlwt^(&75gsNvep1`rYiNAXX zSwxM$D*PR{88$f3q)94M0dNjcT$(U;N9c?~2kd751L2NaJXG?B%cHm@sV-OQg^RW) zNT;eM|Kluq(Z9G^1(}Sf!bY+rh{KB{97^r(B7cqEt}0Ox(09k+xOL6;QGs0iQ=2da zjv8z`>cc86z)5FakqY*>R)w)m!#8*cTv&8O)EMr; z{3NXoV?K06z~zADQuk$yyYx_X3JY97D0C)!kKl*@N2Pb~-_GTbovu^{YGOHxi^|iy z0T*(7Yv3!=J*^&%Pb(m4{>Md9cZ>IT)bq>Jkk@(g{JWA{529T#HZ~kl63G;rjVRT{EwQ|(T2tUMLqkVD(7W3a8gfG)>h!`S2 zmZ>gr#d7+u)@hVF63(584*f6kHoBBk8AzeuKQLEM+|vY2@6Rv}LLy zGG`_Cb=ux#4fRuUE`{6=eV*_DC6dod+5^2kfHR3fD8L4z+)F?^543xW%%ACHe>dA; zN^Swf>sv%pZVSWhm{ctLNS*$p&@i*<NP&{)=C1=U;mT6_==3M7zA zjDMoazDw5wb;lkk&ItAvxT5~Uow6(7;!}h_&w)(LYqVs3RDm=xV+&F4|2k=Lub!+6 zEgVH-wd2(E(2P9~H+mLb?f04b5r@b5oglvRlo#H+@@GW9fBk!$qbIc4Yus%KNMo*4 zqbcE@@$Cb&Mk2e7e93uaij@04KOHY?;8;$N{<8N_{E{;cl~zCM?xRc0pIGF8DU}7A8xq#EblVJr z0~?f9%6X9FZFVN<36FN%&=RCccw9ylqYc(ELhrH$3ER-*y zincWr58q^+#KTY`$fTWhqr+4$ohOJ%djQ0% z+1?jr_r?RnE{UEIYejg$FEnbn6}HA9V}=Lm@-{1To()u#W|YJzlI}~p zAhNi3uO=s*rW}veRm%E7Qn}|ZCf^kak$Uc@CY=2I4*agIOhwRGFH3_i7@05ipFxB;-(}>;&VG9mcT2ky{8S;m^e}Pwlu_`k?sd|) 
z-ZV7MK@qLNt)XyOT49@`9xW|&lUnW~u@IB^`P9nVOo5Ik4L>@^?+N>%#7@!&Un4Da zrOt8l>4bIZK=3ZcM|-kF26Rp3OJ$Xwk4U(DBA!`1IxpP`R%_Q8OYA6bZG<0Wn0fd6 zhvyR%d%f#24+dxACT#{z0pW!dgaorL#)`LJN=EgCy{aTfv0ISgs1(y1I?Vefqj&}J z>f*NXIupA{`CNW_h^O#u{7S1&2})7meca-D;!9_vE3-m1Xx7|34O|Dr$NW~jwpDn# z@#3Q~w1}l|D``C_bgl9M25(#Ac&?UVx!b{V1+* zLWK)@aV?hH`)kRAyzbH@z&5|Kf{&7Jt6$ttk`tdG&O4u)n77W&zvHTDi6dvDMa^yZ zt8bO3WpN&ZRkHol-wC7rxY)Io67hvIm;KPX;@_wjj&hloe%s`VGNzvn^OAY%W)cTU zoKjF<-r`UDh(;Fn$TIhOreOp>wG>H1v1B`9A%wGG-n2TZ=ZZaw2K<#|mPXKF)nV4) z2@cxIrQNO@in~AYelz>^nO292^La!;m#%>`r8=ubS_6nw{C}SU=j>jD<8dSklbEtg zqEhoA#(VvzzPhp0iUHMHC#lf`ImT|_>GL6CSqaq6D+nDYn!uua#F|s-gS4EfF4JzO z(`$#~ru0!iuL&fz48BPGJ)P!odJ<6%UJ*7anK`Qu{Rgal%cM4Au@THx6Dk$%lyF_g zTK6aM5<&vj(i462&!dw~)8QH4e0D#|ad@gJR{Y&LV1@v+~Cy%WDPKV2@H z99)Q z;r-cKi6mXgZD356Sw>wRp`DKmiT2S4?l7D7_HBrH@z|JA<)w{D5gDoDU$Kk|jrr;} z`@|=7c4UtBbF6R!Ki2dwV5EV8r1k|eZriG|#&taGJ3e5vXDgpJ3+E5^0CVq{4at8A zU0}X(nS3h3@Hrq`JV^>#^=YATSQU*gp%Msq%pOhXSvLgOBYx3;OK9i3{_j2c%QJ@P z50!2&$CtDshom#=8~C}e;$|LH`_>kNcMScX5(`dc24;HI<(kv#|7+?~Iit8P{p>Nm zXmYN@7M(?p>Z$tzb&h@BgZ=r3G6JflZGY5Lvl$2_x4|>RRT_&t*9oK02^{mc=?=s7 zgfI)psHLno$NquCX4F!2qNJ(%4$q1YOR}ghnIQEakGWz^!^4Xc3vk>A?+h`E<7b9d3&Z}yz0+>-8v*PxzRUl;nG)EsX5O?8kC}cIQe7pl^M77MkWj| zt#lLVIY1TDn)aaRV69OnNVIJwT8gV1o@M-#*~MJs=mCfy*O6iWeD8jHnouRKLhU+B zd-v}Bbqo2qZjNmtQm6Z~6MK|W1F`I&Zf4hF#}Fq;&Mw^-;cD4Pw56oq1vcqtYQ39( zKFP1Lbb)!8z$@sclaWRJ!g{st8{S8&Rkq8WeU)pk3${K&tX;c$%H%CDlWXIJP zFC9y5erRg*#}x@gZjgp^7k?Q~&<^Nz1RFX1z&e4_%&q}U$W;$(Rud-c7n@bz%udZD z{-Dy2U80~5BOR67zv8U?d44yB_FKoYq*0qN9TZ{|N;tPzP7G8VmX14@SY+HQSCkdw zO0#4x`B32+Ydt>B*^}_>Y(fhYga+V@Cl6hxzJz#@Gr#J6O4$MAn=Q1 z61dS6;1S3{Ye&?J-ifo#Z9Qt)WTK-J%Q#>J3Vf@kkAXaCPag}4o1UnW>6tn-Nwu6t zJ#0RuiE=h1eZ5}4Kv~8^eIEoZ*K-xa_m96MQpFOF`1R40=Z`y;Us(YzN?J2Zud~+p1#x6Ymw@@f3-@1Lg^f#OCGQn_T$yw{(&07!FzDo}T)unXi z>{hSScXq1UoNnWO+)mVcWCz@6^^W2Jlr>VF_vFeL6%fY8Kk9psN_(v;;OD#i{EN+O zNc{ar8zJ+L7s6xR7IK0+wqe-yR@R>-BXsCG37$=){u>oPRDpP5 z{N?7-kcAhAoWs}nMW3il9mz!8Cpxk3ihNTtvsOVcim7Z2 
zW4kMFP#h+PO+JMc_!052x)ZOOK<5{HdmVgzm<1-2m$BQV5-VIVURkc(n&IOc;t$6% z)vh=xXh<<%QoC0ONk%)Dj~65uFDio(1ny&ly&q1eN%!R@`=r_}sBH)nJOlhn3i=e= zSoio!RVL4=D@8UenIzuLusiS=#TNCk?L`uCDXOQiE-nm;;P6y{pK<$eW#Fa$KK3de z=(EI6+PAxs)~?x+HE6{)Wr+Lmg8Pmt5##0^ifp~;T~I-lz($fo0G<@@89LT;{?bIU zY|+EE5~~R0O%+}IvSkf7nXT(Y{&(=;tL`ZrW;n>v?#*deENk>-A7*#i0S~n?D)o$*FGB-_Z?PUH!CIZhpQ;OSYn*Kp2p0{a!Vol9 z+!O(B*Rq?=R&KM)Ssm@OCzrvm4J!71ON+3NR_Ht5Lge)ap=}Xhc26Pe{fm@Ayv$P3 zO^>xH$*06h8IDxs(cj?}4*GC`)V@3-2RpE){PjFIuD<4^Q5w@RHg468Juvl`Xb?U; zRhqe(m!s4mG1~WgDo$A_VUaj^y*KG*z`+x5TRK}~lRuxCJ@=U8v(S8GmIc ze&w4gw#!mFAb9UhNVAX?G{7nO@9Ljl*?R5RMhU?QED# z95}YJt_Qz^q=|N|`VFHgz3sKFSqMx%rw@F(L2U2Eo(J&Quvd{AGgpLK6~&rL5f8B% zF7&&p^_Q2(oA?q*H95H2q?pQW!bc1`^G0|`&ri~jquS&BD9H{NypO;bawS z1bfTS6zkUqJDW&$>ZAS@BBerC&#n|QuL-aM{HXWRE10?)F4FGla8LxX3?w%vvG=Ru zchZ%zRwc#6#9VaE)k6%_TSTn7CeRX1&NP&$G=#r z_-C;Gjuwnxn!g;rSCXMu4|l$-VVxUl14ERGjDuRH=#H=P;a0}`MrwgMyo|$9gik+) zcUwyfLxMnFq?r$ZKQt0EQ<7hx9aM%Xh20*q^YI$5?%F(IgPUM6|7;C3&$&X2HUWEO zdE`e)zQ0SuSjlBiZ{XN_I?111`W7RSJ&DiCa?=eH+XPh#Sw=i-8e$f!yJaXdE^I3K zfTVw^+$;@>zP``Mk^dZNw06Qt?@J2dyTk0o&K$iekm0z02cvSlbW+I8BKa0Pf;K)_ z+k)%1vZRO1RrfJ8n=<@du6^olQBL-SHyk&^LZ82%{z(l4JP^LX{feZ0@@3&K`Y|*7 zr=E0(>Ispc%acqrmyQ4-`>v!&ID$oRPi{ICwKrINWUC}WW$7Oe`%0dp9MvA}n>Cu= zrMTQ{)Rp>u3ylr??I5m86ra$B;_9Xknv#7wgbuVPSY*Stz~v!@tZq#FctNWa{DHKv zyQ<9ji;8AX9wUhIqqv1x$oYWyxUgI#PWC?DKpQw&*kgY#1R|8Q# z)c=PHsDc$6&X7tr&BK4+a8#w0lC7FGNEoy(rXkPta1;c$-^91SQdvY<;AM94ux-O^ zA{0YIW6`;o3JC~Wpokw2X@I2{BIW&BI^iYSuq>gyakE8RBXY2d_cp^sa4{FtF z&1&tfMku8y+ESwwHA}6cHVIm_YR?+6N2yV2)JTllt9DVbx5gfkA6DAp3_@`rTx&1I(OoICT^6@#$8 z>{6OJ;`<3pItk*f01&wmwBG5$2V!ZijMlmD9d84qjc8y9{%&!^$xchxUw-|sX(IuP zD%b}~2MtGe$Wy!?3Y~JHy;p~Wmc?!KG2V|aaW^-_E1M-D%WP&vxZt{gbHyFR=99$I z8j}CEy>LMkuFj=GTUa57su)2j#@U(C4cs|~@q{EiS9&UqUJSb}veHsvl%T7rOR!rpW0wR`j+?K70w9Nm=i;+?3 z_&?>$$0!G+-vCM;^=0X?bUw(imhS88ly9M2m$CD(uiJ>jxj(7UVGF)8mRgY001lZn}14`oB|sVWixwaO&K2 zaspjDny9!5*+ABR7wme^6QUs~{se|jzeIRGoaR3>)gOb0otPG0yo?SNg1vXKssJht 
z2_DIWG1TicfR^s@IAq+USAv=7)})Y|g46gb1PnWV=ucvW)OehRBGdkz{%Ek)-{f>4 z0gDCC@e+b-7c7klFJL2Q(zDvMIfdhJIF9FJ6`mMFm|Y1vJ3d^ z#_N6okSu}sc2(JESv+O#l2c~OyjlY8AEy5sdlwCQJk(b3Nj8LwGO+D_fp~v;KT>aC z_qGbw9TCM%)P|IZVwAMlX^P)k|HVMzH3ATO(Qw*A~#|pxhDRY zx)FzziEFycXYI!GIJbk&hSD*k!VF5!KXAvzmd9;^5}1I;1mX=!;rsQpY@e+m$&q!q znRXel7oa5rULcFYCg3yVSfx#)bAml!ojhkt6X# zTtqpA8rdSZ4pQLDtFF@S`1V&sVlS&>&5>C}&AY5U9`eN?hyXd{zBKWIr2^~>O?oX* zK0G??f(U=;2?u*3@{C8mMJ)*6_bl7$3Ay+14{YuTyg?)bdvR@OFj{Y-(z9UUw;i%Y zL%a`YBASqb`0+D}_9OF7`-&=^^Wf#YjCLH+gZPfK?7y1+ z;M?@;c;4^-ok=vVpNnC)DT#f4P1`?RC4(<3cU4R_`j%iMoORmoMrP5${P-9cSEB_c z34l!m5%-3s;W2bstZVeQIz^YMe?$Aq$H;00UbSAM*gErR_aSJVke49}WWLVw2L4Xt zY!OXHfhe?IM@op7;~TGhvi<%Z&&oE~9jvu3P%y6LiY`@Sox8XuW|?jQG99D9F;&gW6GWwbbuS0rZ>+&sUR zBz_18?jV7kM_D?d-0meX=gkuN-5KUn1cZYImQX>$Zt>|a!TKYD%O-;3-M+O)kUGQV z;QA}`P#S1%!?)wm%&9&kkLmK3%W!#*99LFYHS%*4NYXK_NdI|3}T-Hm}lXY*|bN#1MZ=cw7%gGX}@NluK)?Mg=f{ z>QsWT?B zuW(z?Jl>BTfKkZIe)=oqDEmHH)!gD-97zd$CRkq1wkzV- zyr9|+Ky@*6d9dt;8l(gHA|3f`>{DC5>l5W#??yB9<50o%4!X;3)9Ni4ibF_nAY4=G zL4@Ab*w$Mw>&LltM~VsTuf_m4m%wByz?~52>wXuIttjoCAbW4sW2L?kK#+D64ocPt zcp%lfF0_tThaaG?ZV*i6#xjbx8zgFzk8AG8L8&R%`t{>fd|%Rzv>Wsw|?<_NKKkG%1zj zPpZ)>aQB_mRoJO5e+K@%9~Q|A3wV}Yehiyh3;b{RmUDFm4DZB0t@#JJw~pk87ThEi z?ox;20twl;t%>m?remWM%qE-VAS|xb+OV_aW&y`CJ*#!fh#o1-y%>29mS_ZL#;Io! 
zl5v_HIQv45!$9Nv0HQ8t7$`O(LdrQuoL=1Yevw;JkMdn zl}P#HD>d@P7u!g7Kw||4V2+^45RUfMwhP+4ri+Yiq!+$uz3)LJK@cygvkiyXZ+_7Yzk5vh9kNa z3y_@#_%J_g%_>hi8g1h#YryqSQWe_D+i5>$5o50__{}1978xFK&eO|gYd8+_;nziO z8f91nh(7Y{y*b3(8*)kbxt`$?m+-#O$@8G;i4=6wCTVgTW7SHC%rS{F<7=^r`c#R| zk^Uup=VFeQU$-qUw7;N<{9^1&W8kbRdU^D-G}60M>i0+FNYPHs5#EUAo!b|S>|Ep7 zNe>GfK3kC?uMus-;768i`k8H31ksKrCc6+ubn@m&rD_VnIXbRu@@Lh<3XI<|#>H56 zhHX^g$zI?U<$8~~(L8=MYxSoUL8uv^B`F`koMGQyb?OWzY|IiTR<x@Q57bj(9|+_nY7U%cr0HRb zieB?}WlQPtcZId-uRPD{iCvJfuUHf(*OgGdlCKL>H`iXb=nhg-sT7X%4wPg6ST)=_E0YpTs2cg)iy8*Mrx&7~7r` zT#Td*w;3sm*?OJhfCgQZZcRRtZWwf5gi+QkSSgKn!kVK)_IBY%5^6yvkii*v&Q1JTd{s9J6;gDfIH+5@UCz>fg7sR zd7Roe!yJ77Os~fs_)4xF=wdTUhQ?^3rlS}5CQg%ctw_?t9yuR58wD(iWL0i^fV@fR zXUn%-I{Kty?IoU^T0R8$iDMK^@crZ34yz#KqYz}vbew@t3l?WQoFVp?iAv1`tgzL}3?&aMb0_#b3XgS{&FQA8mo&)PoeEa%(# z8tS4@zw$VkD2JdYA|71CL))jA zNkAn^WIWM4KAtc#bcqQjra6O&W{1aDJ4X$_Ux-5~eSGz9 zSykSHJ(J9q_^g%xjTWE+q-v>#>m5$uxp~k?xAcrjSO#yK=9NeNkb4YYlGdIfml?4i z-jMjehAaHnsyX#zHPLY6zU}H*$Sy2}e4BKxknrjwViLu-Ou2&~d&oWg?!WEBG7n5b z8urDqFfnsJ@!zb1I~=>Uz36w4@Px$PdxeY zYsASS_Pmpry`qVd230%$c;Uw4je}|Jz2nGqO7V*&w@H5ymXg$pBvl3zxq~o^G1q zGj{Z;hrO6UJ`P8s4Ea~9gXU$Mz2Z62?7eZ7#ZT;Mei3NRL}Pp2k91wvG1_WZAZr=a_#sY#RX74 zK{UE59&&WrC10`5I=4SjHmEKI1ln1(VFss5iE+m(kFUH>p~TL;-nVuo z9R^*5ZP_o2B)BC^*7G=t%r@?Mol1}V1!(<;|C4Y$SP8#sRyDHSh_s+s9t*P$(?+@V zQr|&9iCu;{SEz{HYo4cA?&p8sitvt-811(}*UT;!2O*N zTbn$ROKP%w!5$_0+kC}ohC^dSl)@Bse_8B=mBD*^OIbes$R*)BIEy!sh;gh6Gb^1f zY|z}2C<%SeyevTUwuFNkH6hC#4l&z*Hn!6laDEdW6HeSyR9u5k|5cu<+R5BDfO7#} z?2{Oi;O==-+j7<|`YVT2X#_Wx0*`sU!*I;q=?;+N8NgHeZ(3%qhERU{+w{`1N;Z(SNz_ zO5$deJEl8+IGFy0Ul#9_qtvTIBKtC8qh+W2`mNX=51n$RzPE zHUsC{`zV^gnORDBfO)6xUOid?tD`{+U)Rz;NF=Iv27=!jS>->lKmrw=Gf%JGiShkq zs1R=H%6C}3C)>5brvc88w?0rS@K{*uhiv%s2kMgFPp)&5LBk~UZ$dG@iQ(ZB&AHi% znC+PJ!nl*Pqv`)+QShb8CU!R(NKF#pO=eV(cRpx`vZI-9JJ#!M$%P}ey z5=!o`xp$;_EFMA!ux*_P`M&z10ww<+jU^uDbZY!#QhcSLgpY0yDtrX{I5TQ$GX*>%^dA=od3oE6v&M0ibI&rP!o?K~!>!$a2zaH|2uC0+N0w?XawVF9+< zf`U?{2Cj`dSRM0&L^qzW`#T%D?)C^%wrVJ`Y`VRV@Sd#7*H0;mm^oWSM-8 
z7{f8n-M+V5Ag9MQ@dkGC2d!nYWrVR;1JDuLshm*Lw1i}}DUPuAw{eI)31 zK_m?=`Gwg`1v7DKkJ{yI6*PqRd)bqUYc^QC{CCI2ZNhk zRuXJ$LX`<$yK%qi#;j&eJ)VO#z>mussIafyVg}f27K7Jt3y$XjvC`H~`}eUuiQY>J zO&Aj=$8jl;|Z?P0Qn6rsTkZ*(58WDpba`G@^ai?01D69md?Uvktiz1W1aT`>MP&pVs|4`A+m9Y`O@)yRiFDR3rcW&iR!eS5^Q$3@>?1 zNb4{!86Kh#4cIo&K8M8cQ5&MkLqXhN`Rxjy)%sgxN=8PTC%!iyIiTJz2Q6zaa~m-) zsayOBE#<@^*Pg8t!JtcqOY&yj)6rEXX_vS3a&Jug@+?sL#GS2CZjM${npS@bek60_ zw9zPm1I57*l?S9ZqZN{e~xJ2%eiu8D71Hut%~%?{hb>f@aoA@j?+AYazYu;a;k zBdZos)h$BE`U&x)NzmYgOzgp2^OkbQd4SEZk{V zdCZ5AQz8L~nV@0V4yFmEe=@IUo9!c9LIX&vDk&xBRBI0gSqtEcorfSoIO|*TH}Iu9`J+JX8XQs3Qz?SzgKU zC5GHu*1fz!tvf~5R@8?d^Xu&bq0RL|WD*pvwO96kBGXLQ)jKVpBsiQOX0MqPi;V#9 zu^q=hZBmRiZZ}Z5dfzlMhR-=}+IMO-HXYErHO^IoEj9%TDO|zwJFTO=-7`P1!=Wel zsK48twDfL<1+*e=$w{*ACtBELqi#{?+%BLv5hy8pq;*1M9Pw^)&aWVi+YVF4uU_YA zH(NaOKov>euNo7LA;{Ib=4x;$-I$S+z$C&f)?vg2(O1W}Om&DL&)1~=%>Sk`3F8lf zKfXW@KF@fUAUThsx@ua?hfxzJR1EP{h3pwMN^l1gMF|nQi#LC^}<*E25b3H?n85iFVi8HYzt0oVkX5 zVb8XQk!7hS3u*avM?i^F{xM7xDr&>t{E#sAr};EFHiJF82v@BLL>RkHh9G`yy?@eN zeB)bPmy8g$Y3@?OqdFyhyimmVu5c3W)6B_JhnJ1lQuh|x7%WaRqM`Q<0Fsix&r99K zR3GCUmyOG%v`?)m8Y__V-wgIqk3+wl)-b%t5IOhweUt&F)GwV4drOPWaDDN2Zn0IY z@h&jS4;6+7czL9?BC^}X{SDYl;ii^gr|5zEg6+}f_z7QV`K}J>HBs@^{QfhFZ-S2ECjQdCv?>ecak5_$QszlWXpKuLFB-qxl$UX9b^qGffgC z^8La;?;3Gb_RD2mg#N}i#)HibRkvT6a)p06vq0e>-;bp~(-G5`_~Vx^1xfNb>g?;E zyv3USQJ}ro&dKI*^zx>O^MXHM`sc@Thmb8q5VY7JGnPqsKM0%65oP|YP{uE1qw76G zuIWVoUk$s~N1x_RfU~kuPp&|=zWmpQ)6#CMC_75Fp1`$h=)fxXmQ`S%fjCF*2%Z~_YqWW&PUn3f$US)hwb?emXyL`PyJT2 z1=FJTYWeGRd0abb4;yns5?kj5#nXOg7uAYK5pF|g-xw+TzSxnF#L5o-%SUT-ozcCtt7lt799o z?T;^gV>6h49*B+L^=%`R`?{Z>s3qrrDLTuvE2I_)pw-XnnGm@Dj!x}~QqC3(S!$4*^^4fkWC)DvGXgLrgeAw|*Ml_8(u z4duHi#svJxlSOZu=c>gVj5k%^T#G&7-x56^++S+xnMpn=emy=TqAodR2W`Ksevoy? 
zkbfQZP?;upKc(E`8B378yjm)0E!{_fen9nZ@}BP@=}}tjGM1XfetJzR9dPJXl~{c} zq%{mMXFJ5^?HCb7jd827(qzk(>$BDCPHAOgsh$_wdxO1UD5|~gPP`5wbrtD_0Snx%$;DZQ+11}O1h)B?_EJ&y>bT%vx-uE ze>VzN{f39a-zzHIk=_>Gp@`pCtfcK(a>e2HQyX z_1A9^&#eBWh~Be#3WtEl8M`yWht6-t=hS|v+s>hZZzJm9O7gAKf?MRS+U8F)-op2 zWd4{r;*HOp`fV(cA-#85Z!^=h&QdH$!Z#HALm|E)9<*tPf7Vj)9anVdJ##KT)mk*;T@I+9w5nu^r%69A;5C}G0&QZF6cjR!T= zK*PQog!X>|tpzN+0%wlDfK;rK-LnYXdvkI6`wOJwbsfI0Dz!L-a##-uHnqDj9PGZu zHZt$&GELB<4rrj>?N9EOm+Pk`q%D#Nbbln_bMK6E-0@{gx)0;0Ag#MdW_T2+Y(6zA zVO+SRB#OIlnI!Fv>b+bG;C41(_xMzHYSc|!aEw|UFJ@D4v|J9KE7dBGP1~jtweK=1 zi09;=96!A!`s2oSftZ>D1h)g4ff zHJeH+T_ohI41ddFIBIH`&Yb;T@QN>GF3y!{TPSjCjq#P^20lkPPHHL88fFk7dwzQn zE*xNzDQHaYoMzrt{k_B>{Ci&t20(9Kxy`~bCvO|R=49>Y4$LRo*pT-`_e}#mwQSog|X1s;@;R{LdImr%YCbA2~nDwRM*Sq-|Kbk;cyJJwG)VwKW`LJm1#H6>=)$>JZHS^g~0#&$$ITNc#Ls zVo=FoXu)O)&Cfoij)4|3A7($FeKCQ?tqzss5|b*{(JD=~o&GQ0)TzPFHrd0yW~A*_ zdlg@&>~%ahpzMjolE6l5>KQ&YH%(S_mD4(yq7M;s;^(K{1vIh_h(q*}9 zYOhZEYG+Zi$=hhBjN1P5L0?fIi!$LqE$Bx|n&z6%{e_N^zISEUz2Dun#SMnZ@O{}N ze@;jLLL~mX+#Ql=+?Vju9JPLR0R>Y=MEJw)X4> z9tY*nuNTH|zQj+2metmjOH$UI>Ul^pP9F|10{<97Qp0~NbU2{vDdnaxC)>9_BkQC< z@2L6M-hNjT%Q&@YrLdzFR5ZOc`18C>@~f=N^D`l-a;^A9Xivnj`)FDNt>14FeMh#jZX4171 zLXrzC#0fcX&lL)VHkLC+#-$${_j8w$1^Hi; zJ$d12Wryy0W3Mwgj4IBlWnnOU6zin?*YCco9DzMzm8eRw{-#SoD`evF75+8JB0@!^ zsW#`oqMw4h^NGKh=XlRmbk|5|PFxzxdbfJ05;}miBaa)YX$Qf!N-{DfelvG(vt6ft zJQA<(F=oVAyU;t4Gm{NB(|ijv<2+i|H1@3&(~pa}S-rH|Dz(J4r<>xbU8A|CRhGhnCkGp^IV{tTZu?Fuil- z!EZ@1^FHwl+(~O$LEX~4A8OBF{@T`Yy?8S(Pwvp^@z*v=je^8j8%Aq#lNQ^AVGzG0 z9AP1|MPfy@YrN|9Xy#uO)$8?3`j2l5-8>jt^U_4W$*fe^z=fFqTNdbr&Y4c}uYXQ_ zTxqz*nQsH~g+AeF_&MLO$Zusvrhz!5J9r(Id3}$ed0U7DCUici`jqCNlv!NfpMl;{ z_kQcD-QAsog(NWRtBVHQid{7_QJr(nATZ~1X6*O5HG_`~rczSx^+v*3&r@pe@8`&g z$2?}_wlsXA#}u2yWMOPASL^94#c^B5_=)LL?5HB6XYmNzv(=kj8Ff!f-o<+i1B2_@ zEh-5z($d{AhEui@;eVT?WoW+HdRRFm@<-O$+IR0nrfd0@b!wzenW@U5jw6-92czGo 
zo|adiC8&j-f77aZ`RNkF+}Op<{#l`*!b6ulLbasUYaXxa;mH1`iDP@~zUkx`Z-hCg+^PpXrL!pCisdQ78>%JPk*Twlx&h|5OYZKH+OQKv@e13W9NR_Uv2D1!IBx#wyq|NqPD17sDV0LCSN87~sq=s=9-H?TD;&a{Qh+%tw2}Sz?2)}lmmY8^m9qm zuby-AA6&Gq&GU(q@iu19UM@)qw0}Ys++3c@^LxbFEmbch{HUZiPBqJ#?Z@(_UU4yf zNHxI5Mx*%cDwW%Zp%wUCI5zGsze}x_(8BYb&HYr_P4LtH6W*ZFq5~(_5_;?3lHR6l zkC?T?b4iK{jxdYLkunCF10kXcqg#)UrEdqmAjjt+o!9?Rq$0h+i6DK7u&;A(-vdwi z5hM^EF@FQIleRBDb^2Slo0oU5goSBH)Dl>2z!eRi_7>a&0Nkro_3{o6+c+rI5>v1} zYm1d@D|28MFf^j|SDY%~y#39IO(@I0m&xu^W2rdw*~!&t%UBe<)t~qeA32TewKX6R zX3(qsHs&H*>{^s`bl;sZ%CoWt|7Y>G%SpSc8l&|}X%L?$Auj$|5)8>`@8M^+`cM)) zUUKepsOl?=tgL~-^qVxOB*%jO+d1D;3>I!5WEg+#FoHtYdrmc4qZt}8QMudF?sJk7 z_h>$khOtw#1Sh8_-_vkGg1kl4u2G6xeKDWJo+N)t`osB#*kjG)+xYcqQg7l{=#XPP zRs1#7hC;R~xI8DU>75qUG7cJDw#@J7uEhdlC09!%;q>x!bX>Awz7HxN*3K!Dzm=4y zhHNNHvK$eST%e!}&RlL`m&v;B;`Bq4vBznQgztDup(&Zo%rkpB^sK^quQ*8{1?HgT z#&&&Kx6Etk`rScD2D*NGX7RD{B0YyWX#hWVOp)(6(+>BQtE(ullk zYX40o{`7N&5?qe#r8o%jBwnadHiYA@#n6~Gox;ezpvI3P=G5wm(*p(naWUvaMFi+h zDT`q64;;c1@5D*6*B_w}FD?>ZvjOlqomBlXZyS^_R(jAty%Zm|(ogPM&2;tBvF^Ed z#Aycvu^H$HL&V~HKM;%&lFz!fE<%8=laT7RB+m@`B+ujxi)D@GToF-)KW`9XevW_M zX5NZtYvcJ1q6r~}W(T4EXzKnRVic%AZy3LaL$F;wV+p!6zh3A;8Q#(CPlTt@!6{fw z*G;D3D`5~5Q8gvo$0DaYAEb;)6BC}p!}PaR61v3(++M>}Wx-$X9m76pSq7UF*2o+v z)$7@`+pL#Z=(8x+zr^wEzCeh$xQt3K#C{v#?40;`d?aUXu` z)foa&nmpg@o0Pj3P!DMQQ;e+?>xy}7{WnTbaS`|;>{jj9h~ zWN8oC{OQ|PLw zYoC=n_2xPWY(*i2FOT`I(63FZWHy|CJ)&3{>(D+7Vp^(^SZvML4gVX2C|#iNEk_EaUqy79 zAeP_vEL`iM(fAgxGW#T$fHMeII!npj3SpSZ(9|%d?Er~;1gk_+pZ47D36*dUM z1Gj#W-9cm^rzgze|8e=xHh4g}f4R@#W+-E^z4GZw(xhQgm*8KSBHK#3(} zCnEImJ?C@lG2hSY8aO?jng_9%NF7K%D4-B`6F0Jjx?ctZ?SsbxtgFM>$UgPtcSb`7zGz>p@Zn3|;3pOSPRBJ`Ei$93n2EoY8_2PhbMftrAVYYD7&aBy zLXGkR)>1c~j%upHojf84Z65t&)L2cv$;BR28~+s2cTW_;U-2M;gEsAB(gm+RM>GK* zR`W)K>Z{E{X!s4zW3TN}9FkYilm$aG5_nGA-b*Y5%iA35+15H)wZ&GARYqCKUBdp$ z-q7mKGy89X$Aebn?G+Itt!6@&fEd*~?b8(FS$p$?i_`xy6cJ%gp_Nw7Ob=NO%sMV|BVR695OV~fD4s7A zJeH-3*%E6dnq*_0et{%78r<%w=@WOqgG3z{3${d=?Ey6LybMv&Td;woU}|L}gvEq{ 
zq+$py4ouuvn7}%Gz3XWkJH9hw=R|DDHY6TZoc0PVbUIugmz?=mc;q6K4o8lY>Ug=h zjq&V?j-07r)&DWHyo;m5aa?M!x_q!cID+Gbd+OMxzlG=G*)K6oR zPD_y@I({nUqx0sd-NR|6Z+8&_5On!7K_9|oj)FLQmwm@faA zL@+9|DirG^SI*>-Dv;zskRT*z4EHc6ylB(ZO)&CS+|-FbRom)`csQ6NVn1qi;>RTK zpl1Abkg@<))os!FK`mrLMK9N6T^(do(6`o`TT1pT-$~>7^H}sJnTCWg7z7Hb$q94e z-tBG5y6bv9tsX6A-EI&*dCTyYAz@k0*Sn;Axdce2mfGjSK(~qYnn+h`88V_ zrTgN<`4~g#mBBPYh*<3yvh4%uvFn_PYtuEW{blO)qbIau+Mkr&{0UGL=JWj>s$I|7 zNRpEDt_N!bybk1a!lUO_CbGQV(uHt$r&XDZKuu(CTHlrJz~$^`1ihRZVcMDiG|YMj zo|=NgyquPnM9VfO)KK0=g2eH35=V+7Z}Y@{^09Hc`>^HDXK%-N0y#nd3K0%<ZzRDOMnf9rO$j`P;NzrK$KoY@v7m>&^_*sQfD+d0~(#kI$XB?pWyv``qLvrDiY; zf6+0ELZduT7*ym4HkMwRxtE)nnh^9Oj-9&>Y*eg_`ac-07P^+)~=a zv7{hAXjO%wVo2Tjj2S<+@vYg|ZSif0te2zi1L_bUhlb}DptbThh4oG_f$XwV)g?PmaJr4Z=1GigC`d-oc z&r9S_pjrO*r|K!Z-an-xM&expN|E|k>{;~LM&1TPSnq7|6;V0urYd-7A;XHiwE`;zrg=uqjf_+3%UiCX*12b`WLJqbkC z^=U@|JP>xtFX=-z4M2!VfHZAcx<~_Yx{j{`&DXR%-ArG9mCirqw%Ene zFUu3%OWnqxXSs#tpQsr6qfPIB* z{+a1$hf(;eqh$|);A8wtq?cHT_oDTG55mtBGlM&1Muaa3-a~E5M3oFNIGaQM=iBak z*Y&ByRPuzrUSP7~)Kn{0H;*C`8b&BXxv$|RVh>(Ss@e&)Lzid&6@$56p78G1j;qFo z5>LvK9~cF?_BSc-$m}Dl#`Y@T4KXR#-DUH>RBY#^D-EJILmLD1R6@fg{7B&3~S%h?nbr z>$ysutuQPDAdn^ag5IWX6d@yH90pQe!#4Rly^M9c=P?(2>^Om^V?0S0VkR2A^q)}p znm32+{3sh3%X=}{B!Z}(&wcQ*bP`6>W>Yy0uB9FR?#pp(T`;bgOCY&KB{97Qw%8KE z3ZzyfFvwVGq4r~0V&OVypUR4aF&30(Z2;Ukq$$sIvsnG%eJ6^FjhL6~-GYl_JFX_7 z%%+RD_3?OX#TcuOeX(OH2#kq*yzu#Y&85l{R_1!v+!YPHZkyMh0D`}sk)6qNdS9wt zQJ#G;nuq5jg^aAZFA*sOzbRyR#tkSWuNJ3@_a)Cy_H)=csd8Bd0WrwM(~4JHra4BTh`Qf44*-prcKf!*M^`%*m>%inzr`1~uPCP{ z?RKrRvGsj4U9rR)JnX^}=K0_wBBTiTBXA5b_;-D5BWDRX#$@)Mas`pOV7;Mxo^)-F zM!mN%G=r;~OtH!w2S;iHM3Mm2-otcuU}`vyVeT z%;YGgy|d@C>xrLRU8FM!!Ib-xc-6-6+Ixw_xLEHy`0K?ZzqS=gr;E+&@{qP2ID~l= zZYm=`WV}qOV3RQZPvBH&Iuihc(BB2K%rM^DMs0Bf4U?w-tsuFP?ln3V zEk1oQygXp1a4YR3Ko8q}4XwS#NMWIjYUuic{O+ZL3)eFh#4&*iWfnSh&`8RUk}TRD zuMEk;-J}tv6Nc53=4+h)!+idMvBk!k5xh&sRTBxnA7P*rRt#zxitnH+v+WLuosU=x zkkTTOzU}m#CIxT#xH{K1ztJ{5Hb@Kdv;S?)!@Zr+kHiztDXU0`a*jvnIw 
z@)4m%J*Ke$-?fmxZGMsjWv8jr$fnWLZ)5Lkw8kj0*Hx~ZVDWEk{(}*W&rVhz9ZTo; zwLQU2seio4SaU+vOwg#s$~G7&sWJ@!{B+X?H1$={bo&j54u}uhwShQFaA#s4Fuh{_ zittE!CaL&_nou?LD6|YYM5|yQv|P4h7=Y}e;oAkQ*Nwyx~!{S5m_d4^~ei46BiDpUqr~71xf=R+DFA z2;9_Pyl%%stwR}G^PS@*e+i>ugn^5ugRZYVoG4s&i9E*eWkt$vrWH>9{5GiCzY_PCSJ(&41p)GprKLs zNP!QbKai}yPcl?FRfXR^*ReEfHV(BVG<_xj)PVG+H9;I_C@V2Tl{^g%gt?En9$iB* z0X2?z>utjk5Ir9j8UbV#T#UbTf$EVGU?+g|w&7i98tpvJ>9}Z%0`UXJbm}A*v$2mD zgvmYA7Z-wI3b%4$52())j{0_>3ERLFmPzShv@q`}`|+BSxwf3Z24VooQ=HQQqjYqe z(wN4bMCR+gv4X3ddrYx!RTf?M9pVWoYsCFjAC z2cOUV!@u70HvKH+l5y`#O>-uJzoSciBz^J{)5S80(O)Y0(`*=&X#lOV_Fg^GdA$XU z?5A1<+`+S^EME(;q!U-VPj#)^c+xp%${ce6*2$Wb!c_p80>jJ+?-d0hgn){XpTY;@ z>s!ax$9#~WI-KR<>>}uEx#aoDC4KA1VkwHPa6ewFXUqY9nW%)JVr`t^XCU;=M1xN_ z%N8=VY?YA!Ch))Oi@&@e#%fmsGB%I%l=dI+t!cJkVL$L6^JA?81ak>mSRrR4-Lo_aWG^U3evb%JQHp(`znS7!_h58@ z(GsYF6FK_WS{Hd>{fY9JJ-uD#wwwRTNw7lGTBK;?rS_AgkS{3uHl&>i~UO43G| z7D5Lg7ACs!N{%sG^_Pfjs?t*ErxArM62jQRrfDh~h62nDz8S}833rT@SmO*ue+4Tz z01_K`YTwG1!wdMkQK1=e>=slDUp!)!)jVs;1Hua(sP^VbFm<4+WM?Er7aL)_DX04R5ok1&47ekFM$%5b2I zYuWxQ(=CE?{IJCa3!HhkDTmU`Y7`K#;W8_OB-XpKb=XpDTN}XqLnCHuos8is@llEd ze!QvKorAgHtR|6U{r-dyA1kliGAkyQs|$sv7@QZD6)tO&BGMl){uBr8&g4!TFgB{Y6?5T2rAv=HU7)#FiF389n52n1Bt_+&aPZG}Jfj?@?4`>=n9zK*o@_ z!tmqi9kf%@ReD=p%N7?^z)ITYsZlyQipsQRR zKB$wYBNNMqHg|9=;t~av*RK5@14IW6+1%~W($l5$UH1R0>dfPzj=nzr9fPsUzHiyH zlaZY$BsFpaK%05o?d1@K(U7Dvia{Nz0XyxYb;3QYg3Q8FkQ zq*0OHE)S8Ye@na=%eM2T{^?5uSnSa%_G)(@=Y)dPAT>iNgN58SBn@CClmuGd@Qxs_ zB{kjb=uc=px=)L!V4&j1Vgve}v1pA}OFhifvLKg$k$a|A>lNkeO{*^L}tCMoRf?{p^kiw^13cGsec#Omv znTrmBy}X!w_;QF^dL3YkutRgkS*s_?UK9c*mAwAfSYD!Ov4+^Y+vuIA_l&4R!e3uK z+q@uSnI~1*Zk~7PxTMBnOUx0}He|#v#YB&qW3z}&iDp{q$QYn>@ zM&Y= zm5_^OLrcMu8En2Bv>QIY-D7$M#w(#@st_FhYAeOB00ZI3G;xtKY6r4iBHg1Jfz2yD zrzh(3Jo%Kf;&OKf&)P{{Cl^I2up-iQKjy^UyVhT;XA2#PHH76xPB|7eHB`gQD>}Z7 zl1w8n(dt;0o{~O|10%RvxZKx1?itcuz!^|N)}mF$S9cLjS!dFdJE_c9F`)@|0PhmXcIc5npoYx2K;dl!)#Lrexg^*cD)!-bVg@E%f?y!d 
z0c01jRt~N-y0G5Ss5oUiAdvqDl5%*+UvM-(E={{nJgHF)A&rR2B|LugGp%2|9Dd|_1KqgL&w7abX#V`ZE0GGa85gj5}qjj+4a~aR`{NF(&n*;E_%5bFk zPvH5bv5I0}ib7V@NF}q;o5DtlqR&9a2=I6x8cPxqPKgh$id#a=^w3@~vF!UZB7_JS z!1*v$>m}V!(9TmzweGyD=V&li!~)%g__L4ddd9RtPhO{$Qv0~xzEUlZ$-Knm<=PR$J}sG8$M7fP&|}z8 z%HJ0d3A7KEi&(zQT3qS=63rO?Bt|_%`H`;9nIGhgsuTQJ7Q%HgVx7<}Y%&fF9hU<$ z*IDl)f8CG(`y1;gXu!)fi!c@M`6TYwj7%2Miv-=WX>3I9AywPnunhkLNBvRWtMHz{ zh!FTl=ux>e&WvlaZ_lq^(pQ{Kv~_7N0+-!nlBS4? znI_&(je0kchQ|w|SwAO^9$Ku5-U2YdDbWjj2&H*0SuNB*DUN%Kn=fgvHC&552^p$Q zlTIfzB5N%R5*77%a&Ng<)fgcBfNpR%X@hL^I68%?*9?iRpk>Ar1sc61KA$!QQ1I$U zq+co95fGqY6m1bTO{!apHNyvhcXRYX$8IEL1X%ar&&FRp>!Z1q-9>_^@F>EmWRx|V z?pp<9hIz^c{S;CALxadQ^ir9R=D?n3?!V}W)TPQ$2>D1oPu9?4py4uX=Q?#-+CU8- zN2K%a`*`|(ZWNn2Q6tqj95LXbAfA~u_%xvdkcT(;P8zcT#pOx?R&cFYQLX%E1fWxk$k9LWYp(KC$}>m=8C?YrWN1D zMPThG=312V;>=iH8pcCQ38r%HUW3yFc;yLdCU~Bw(H<^rtVW9jp6F@TS2N}F=VZBd zO@R&r&|xp z;3K+JJqNlQwBg$G`m0^V9#@(Y{62p$rQm`mRF=zB0V}f4^Bp6?_r!jz-2}KC`Z#sg zs*Tx_$fOHnMrK%CV=*f*hrj__(%t`5{v&e(AG9WP?V*|bnXfPa?+?6VN5b5jcRrBB zQJeI}P3MP)D*}HAwp~|ELewipJJ1`TfIv;PlWX5$aFdr%7{(XJh{up%6e4mJSj(Qp zRGODS{7Y^JhVh+&&IZmZdAk$;cr*Z)#stYfdyOc#TIuidmCk);HN{Y@wr&wYl{h4g zAC+)RX0>NUNv0N2yd;_vrv=R#J9%`b#_UtGZnucq4vm`{GLvDIv6U%C{Hn#E>hnUg zC?Z-R2P$a&<;+AHkc(qbm$~4R|F_V}d-wIYKshW)I7@@Z0Hu^@mPIQ;NXRMt*UsL4Sk+D$` zqTMIQrGajgoe&&#i0-Maj^X6lC~H0rG4`Iy3p9iYLePUZl1Bc*zNo9nr@0_*hKaL*ryzv;*`Y zY+$0pK!8r!Yij9L%IPTX|GN?)AV|9f@Y#=?J=RlxzG0UQAuXbvC+!v{WT-P=)P^gz z=3u+|KF+%z{*`vqoX$eW(a~auQu3!s*jLuM9?&x*;Ir@ZyOc!;dVtD*=@!*pgi?Wx zM%%$@lf-`(PbH|Q&vG(J`AQxT2W=qPoxsG=yh5MWIknZ%hxebtGdGK0sc!-uCd$*yF}_<+ZAuk41eQJ8K2z!BIdrSpx^&M}?mxHIdxU z`H*dV3xW+7DGVI=r8@o&5t$ihLJG4+HfBXQK5Saw$R9k7j}#(!pVQN!r)hfTi7~lx zY5di8RdqBD!ykqz+Fw2K{{pHR-0|D~b{0SuqK|hJ1q^-uK)#C+x}*uiXs}A^+`$pc zC(LeteYM+pA*h=_Z20m~F3$4=oViM@Jr!@FjiAO+pC!`i%MXmS4eZlwkPH*E!DN5v z0$P|Aqf|d(en6;Mzj>_ZOJcq00iY$DC>=TwoQGa+A?6SRJ0e6+HUq4E`9zm%((vP? 
zeK|%=oG?e0i6rKfh7NAzl25H9bbYfW^5bsIE`1#3eNCM8tz7tQjb<~W`R$%M;%oxsu z___AFf2y{L8uVZm)+U50)2Z+D1pmI#_4kgp7aK8SW_j;-?u4^58@ZQYw0gzPAyq#jP zd@Y|IsRb!+(0Lfy=tzqdSvShn_@RaS@q@Q5D<|{eV5vPDFnU+(V?wIWq zMmbbTQN7aFJ>98gVwrebuLVbdTwcqJXBLK)5M$%hY$fqZ0V35E_EE3RCg932id8B= ztVc&bV=~@XneZ^w&n-c2Nmk{9ZHzX_NQjNa=HmbtUa!B2lBIt(F0 zVR04KPfo^5jLVOraX#;HGFc}bQVV{71jlokNtA6G?i3jWT?pRUfcJq6I}y1Nvq?l* z#wn?Q;#Z~C_f}~mC*INKEMLBr@=NQrY|kv}foEjcMP+^u3;XLxLFp;iH{75?u)7IZ z&xnH4otqUZCDg|(q?gy(-j_>^aNg&}*2^$wL)z;;dr2Zb*V~u=gYX>*-Tm@%b&8X` zie>qge8*_$XS3XeiOdxQ@65`Q>PhVj!Hxc7-^opS97ZUrZO4TaJl@Vl3Z|brn{%V? z9O`5Gt7f|5Xa;%yF{@IKCJ&NEV?*BXL(piam&I3rLsrD=psYVyt%<|<#IfwMAe&)& zN<4EHB$4p|3B9xNCTBdOok_5uEVrDN>Nth=wrN$gw7q9o;qM+&*I$+j53nqUIzU@ySa0itgeHlPo9*Y5 z<@RYuc$GCqN^FbcNp}B6BXjIhl~ufiTc))AqG62cjZ4;<@3C;*El#bD*-Aw>CU&ZV z+gl0@;y?|p)^h;*kS+h;cktMV6nxHQkc{8pkwyE=%4=KfP5IOJOKXjjzk}`&IAgx0 z_;focja?r-%re^ng2dBy)!#4`Xu0xk7w0)1+k5NT+>SY2ptkM zpI8DfI^1XJh9#^~2WG!{h9q0%>D52Oef>;QjzCTS>=O2^HTDB@xb}VS1%ty$Nx;m6JM&+6E zdyFTt2Rp`FbaD8bso!%kvHp&?Ch5F$H-nr;ooZWc+}mYbRzEJNyL0nim6kRL4p6By zb4zG>1{(aQLm^$2SVJI1|I-sl5~p>tlI;*JaiCwQ!Mv=>kWo z$h2`}L(XNU?SP}|9$kNQ5ZP46>WS;In0~-rLn^8sZuEl`)N$|>H-%!@Xe#%2iN^ug zy!7@8TW5Ksz@MV$%zj^W+^XfTvi;S3Q0I-0vz1RMTFMHw>yp&_QF;|N{Gtn!o#01u z9HJVmbleV^0lnnF8^oh4xAg%iS%q+l@QdR&oLhb>T^b`?m5-Dt3BK=TW z2<30BHBf~;@ci7sSx=Oz)6Z&VYWq@a5Q9!}M|(<fn!E=)9rtTY_rzNPl_|DERPoR3Fjwa+~U$$ys4|tOR2|-58C(b>*S8XcWz2CEJR&Z^2%Zm!|F*FHE*~Qj5El#N)MzFgYwd ztXUcnB2~0p3%b(dK=pqtreUW_RgnY9CX;=!Z*Z@&eCSl2S^_`S?&P$+7cw=1pV80{ zlMJ_7N3Ql$6n9OrTkAV=`dj2MU2iS0x3FXz`gg_Az8QVY=6U=r>q7(Bbg=F<4YE?I zCPaX78Mc3yk>3|zv)wh35bZud6FF6)2>;?TDi#HZ@Gkt2Hy$3dE4aq=Xy~hnw?Tey z&vgZ9{n(FbWzQO#v2EK8M|XSsmc_=d2X4FWHY>^c{(AemTD3Eu7^heU1KrVK?u##E zj!kWfi!L5L47x8TN_Z3YriXG)^Lg_Krw$yV&*0}e_JYBQw!RbYvrb4N+bW27eYewo zJ`*fD*Hu>+ zU24gGGZcJJ`y`1GA`fh4VLxizUi{V6k1+ZAZbWEd6)Lx`)^VXvB6OG$?|D|XCBEa~ zGBW`^uF{FgE86>Qk*$+INzTujbzK2pHv%N2Kk1t5Pdy+nn_8U}aMLx^;gwg2M|mh# zsCM2$K4|7(Am5n&%UOLiugH3!85XJWtIw&ipsoldaYHKemDBFMx7Px3T`PIvku_I| 
z@m$z<3iwd>ByOFSqD~)15lC;T!53|*$sKkhBZ&FjFh?|LonQ@g!W@g;EI%H&H+TEI zS;_)!+y^p09MhieVsQ8bKmWROY56&*QYV}jH|erH&-&iO>6$94>1`MDS8(I%;G$g+ z{c4lK@{3yA{Ce-~HI)tFc%wuS9;Z^FT#Mc7Wp|V>Jc`LDTW)7F4&2kOY!+QC4m3hc z^J^!ic2(V})+N5UDB{zULGiI8s%dKd#uHhxr}k>0zx_tY1orU~VlxJK41r7}__@;p zWgoG63=j^yV-x7^66m4g=I=rL0P?bOH>G6dq~w(?<&;zu6jW}?Ny#dz$jVkfx$66W q20p&-k32*F&)}A&yn>1XR7Fwoe+`&Tg1w0bfWaMO-GGxuz9&`y literal 0 HcmV?d00001 diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDPResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDPResizeTo64.png new file mode 100644 index 0000000000000000000000000000000000000000..b61d92a0e186cde1a77542ef7a5e06c130e9701c GIT binary patch literal 5373 zcmZ`-c{CJm)V7r+Oj#0QNK(nZZ!={{mP};Ho(d^3H5l8N7E7{IB#b385tEdCj8Kx0 zE&Gfi#u9_Uj9EXw@4xSl@4V-}&w1Z-?s?C-=Xvja?oGbtU@amfBgDnUC1P{M(&+#% z{$~O_2fOl-rQ`$P3%qpo5*K%3iPmzi@4@--Ejw#VF3x|e1Y4PNa3cu4;to5&EB|Yk zd+&p|xFl3{~MC(8m{bf5v#>Xx*FP<@oXra)PK8FG63JBf0v&r zWj}HEzS>cW@r~JgHin%6vZlu_i_ril-qz$*t=3jZJz1#=36B`)>SfGM3#kf>X4(4V z1&SyLQlZ8}mlG4~0_9Q~6?`*zAGEQ-5v+;albz0lmlyrvK;6RompB_jPj;~I>pZW# z5t<7#Cx!n#y~N%=VhWW?M-*()A!>u^NikCa3w24D@$3AUtrlKIX$hM5BF=UqI%G zL0jo1i(W_QJ4K}7t6Z@>$^u8tJK_>_6F$W`e3@4aiRR%JtHFHp7Tn1^ee|bbCaq+Q zOVCEz{0i6bE}sI|+T23M(NM_Gx%muZqC1x@w`uU|nG&yieC$NdN4v%qmNk@G7nrO$ zq{o zM={-vy_eAr3zaQlZ}3P__%!MaaVqvkWIXPRZRI+v5(;;3^8FYih2>aMYmc6Z$8j4T zdTX)yZWDe&RF-|&daj8`%Y<^?L8I)~tP}d4j~`nYblkdg)S>J5aa1>j5MSyyjl9-G zPRP-0nBBt(VV0<(U(c9ZUSZ@gKV@L7gepmNiJdm!dO%t#g%>Mj`n;f4wdt2YNX0P(D6nr!-C6-bjkPV= z{;BZUJD-guSQS-^qj~5Q4XhNQ(WfntZ z9zRYhNekgsc-h^><;-*OoR)Fz^=%R2wn?X@jx(7E{&&Z)Rf*|+9oZBT+tm7RKUZ_| zPYZ2RYsP(xSp>bU^$Xtv317KRV0&$+L4e z;B^W!C$wiN!v*9?Oe{KBW^^p zB!$-sFA*7!p`6?SqxUUwmJ9~buuW6A@-yf0*1BP8NFxRCqA?4Ajw7!myn$W4&!{4V zWUL}2=`Uh`tituGsVg+e9NBP^j3({6wF35(^rui+Q)tNd0Wi`?c`jnlyvt=N4NN)> zj1Vy&YnEP)aVO}6XqPW}$sIlg=v#B$!SQqRg}(AI|Ilz}7i2nAoFvx7bLj@s>FiYS zQ>ja0yZLF_Oifo>`A7ASDr=kNr*DGm+K+iUTH!F$`z^mK1usH^6rn)mbwjq&jL+UL 
z@mjskP7X0NY`LKVGKD&9-_Y5%x^r&?I&*sFH@#oI`oV6FB(X_uDPYsUQn+gxqd z^_Y0hH3m3G-9ynUG?A{$c1;>nCWFWK?c$SN(Hs#@l6sNku5t~e6IbnkJEW2R^&ye1 zHxU}Wkl;6>=13CmkUk;%v@v0!>kq^MjW4ciiR=Dj-QbA@2S+cxTq*m})fGXW&}{p% z$ZSGN;NyJ|ffa{%W6tvpS9cQ9<8DA-F*ZN^-D-Zm074s6xuRV%FkafyUhoi6`e#n3 z)mEj{Qqg^L0;)3HVXRqS_%hqeLBKg;=W8l$)7CMLJcM;wcEPns`0weXTOMum>AT=! zall_qs;JEy8O;A^YD;QUXN~Z46xoH^z(P5JgtiD`3TYG0svP{9T0L3{F>H?<3MPFHCT*0lqnd#pl8sy}51eL5 z3wrgShFwOU@>!RE{HA(zJEl*O_e235viqgn*xR;L!dx^s`p4fZ08?INy+lh;X{GS{ zOgX@f>w1LvyJ}^~3C&A?#r1X0SwilyO8LBvBtJEXnPz52ZnnVdM}^RFaMyTT!|Hw2 zQTcp!Iy*A?;ERvSzX9#@vCCROV=vqDd|uGliJHwPssLNS+PbOp^rZLSG{Mzu1zb#} zvbO|z{kOaR?3YvQZuZAzR`IUcgU;~k*1&j1n*&g*E&YmyNY%z=WUP4$GjD#7eTyaE z%jh_^4%GGew(E9Ps>l_aK@Y@5N+QfRRMR`wj7bkBK(|1D70~TJ;XYqlrq8c#+b8Y3 z!MsWb0^P8L!vOCJ?q{ojg*vcs=TlfX1(Qd);omU7bn_yeWE;n}AO)1Tu}-%(YOJXY z0lHZxjbD}A-U%nKBHn?WHnX=Yc-Uga7sdQ!xlrBf{xoizUQq!@PH?Aj`Rfufq&d%` zW$g9q&k=`ds8y_km~2i7?CF+bwEtr5yT75oDC5vUop3Fb{Wj+b;~g49|Fz{(>E@Kvpphy!&sZ z1x2~DW;8#xe4-FP-^WjUO}FS~9a{tR_{BdTC*5>4vhQa3Z(TBO4~bs+wDYSy?yjK$ z0AW(@iO+#78^~c`1HO_F2|ImzsiIed=~}nt=;bi{IF>U57_;^NzK4rBr#Kg3#y?W) zl;ksOg1Q`;s}uj-cg+z~nOP-ojeS|YZ%-1~tXjhtsJ~F~7FW$a8>h`jMTd5;J=+R= zAg*wZThS)=xaHQfwD@I+sZH5m0f7ovg+B}aV+_d^>6KzxZ{t2ZlE+ug9R%|QT|&(D zg_`!oCtpU-_7FV!=pAxn0mN)d$+l4gMP1<0`I(|`pax| ze_OCIoXx)p(pn9(jq!y9GoQ)U8sdBeP4`4xh9 zUW@N$TAuY_=Vb0aS&n@*7(d!|hA4g3j9UO4d&~L!A=^8L+EgI!{*t%;y^&b4;_<7$ zwmLb=c8bHy(B36k#?3W^efx5TLQP=2i>-u4)gL>MhwmMh;N%M)A(S7hJESTq;j3j| z?B;Rrum@VsK*{h?xgriDLXk`@?#f#QU$41_xWlID(9B_p$qCmAa^pOAQah$y$p9H( zqP1`jMlzhadG@56PcvcXSHb2&Y{*9fbLZSdZ5uk1BYajP`j|s-Z`LWywoymcE#E0n z0Et9u<9&ODN}fFoC|`g}gzw|IU%RQ78zcN?9u4m=1BF`9pCHizQ}m88)jvR)+K%e# z=7FFlTXFigq?A=z47ymilv6omu!i@!`vN>qB>x;Xnw2=;iG@ld%rm3jB=jYQChOfILIx*dj#u;u_o%$0FsGnVKQ;~M=5_KU*yABD4Y}V& zj?@fkq^JaJa41$D{yTR4Pt>F%WfGT|C6yAjtT^ImF}+^sd(mI?@GW*Ees${u6bLJQ zO$Bt6!BUp22N#V7gDbS}!=4?NOuZo)4{1~v_;e53_WL{@90G2pm6&vtu@`k|-|BVEZ zb8HnG9!0@n$H9wY)kl;2@|H12%vTt?NX1{vTAu?Fo 
zJI;*GztHQ}hZ6NGJ5~i44f`7ASO`f8byN2QdBSTCBi`tEncYXcS4K#(9_lYSc@sWI`wkl_ zyeJboZ61-jWhZ7OyMJ?BR&f{|nB9g?($x`kFYcW$){aG{0DmNP$<}E|GVFP19Ws1^ zZ+D%PG}Ca3v)d|7l!cbZ8P-%T_0jk(+0AGj&Ccx8O5 z6yzOC-u!keH|CH0UHO@9c19HNZo+;Lv!-^Jf29~ORT>8&^}xk{zO4<-N3!_vT5@8n z4QV3#bH7OX6nlN2x#m(3$P)6299p{1pQ=*%io~=v8a_r5^rWD9zb7@mGFb#MNZ-5a z);H7pXUic%;j`2T`2+I#^dFrN%;JFAh2|TgnGDm-oWu#^Zf8lv5AUo;)S|-k#a{2>1ak4wlN~}d-D?kNd%0i z;;~AfS)Y-&ehI&=$KZVF)f+iudr{AhP1tQB7#&V-$jqOB&vSjW+Dp6a#7$P+4r}2g zW+@}IF@>BG5`M!4*(CdLQq*OCY2Zcd$Jlje6KAr!D8ndr>jqVSv3`8HiZkQwM955U z8S{5BZBvueJ7^vNI>r4F6yAf(Z3B7G8|9pllo;F|bB(df4h$aMq_^yk>uf%a6oa8& zhkieK4$S#I68+u)?2EV^Ik5-SHH+X$)B{3W`Td5P5ASg7$#S)?F*T}r66&1mv=}S4 z9@6*pNw{;Imcz@$a!vB{qY!ozcJ3b zrTS(6PZpRU;Ge)#%67`oC)q_n+x`yTeqYZ$2fuu%(m-X)2la{An+Z^h7N$ji zJ!pXBYgekTg!pumMbz@Es)2*%$ACnPKA`qMoMQZ=kcAAN`-utG4ICXx|G11_++|jk zzp2-0`+PQt+X9>@-bt;tk^}4BV(Vc>0$=9H8z8ogbEkz103ffwgUid{4ysn)+fx{! z=pZyLL}So1smDG8dpKlZJ0CU-=4=P&CS69F)<)y%0ZWC|o*>x1cUJ}8y_@t}YS06m zrT&5vU%y}l`ei{FMEzmuk&e$r?dGK9JrFem7JL81SmXE1Yn+y(yWm#Zv2XcNX=)c? zzNKoSP0`veIm!hU+1(j9UriJr6;oS4uDIgS?qD z*MwGi?KZ7Iq!;^xx7TU)9+p)h6+Fk5qHhl^$AetPReJ3Lvl!ssIEHa^P- zrw?Ie;V*qYh}@TkmE2tHvrNeA;xWdlm1+}J>2?xuu2MIUHiJ)Mw@a^XvG%h2$y4zF z;k?S7IARla#FVDHal}|%SbhtEg7fn$E-SG?aH@^MI{{XnO zHMMlrG_};UjhwX%jCJ*mwKdf=^^7$&58pXu@P8a0gx(1Xi2DB>;6`s!4ji~_tQ;(B I&2K*aAGc-ylK=n! 
literal 0 HcmV?d00001 diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskResizeTo64.png new file mode 100644 index 0000000000000000000000000000000000000000..9b66ffba3df9ff28c75310937f11a129b7a85e74 GIT binary patch literal 6008 zcmZ`-XEYpK(AImGMWU@1Jz9iC^j;&;OSDx(R_`opMU80DJ0VI~y_a1P1gj-Vbcq%c zix9Eae%|lj_v1VF%ze(CbLP*Sxiil*Nl%P*XsPZ|5fBj2>gj4ey+zgkjDqO)7NFiu zx&;z8ooUaErSCYue*4lnDq}s`WJ0 z%skdz3zkFfEEbQdm88_I(X+1wTLcCrlLQVb=E%6bIZ{KVny@-_fNQ=;DJ)hdeZY!(46 z#T=JeqOFS(?2pEkFS)~hS4}BTuEB{xc;QI$<+>@tNi>q~0<5AG>s+7^C^II|Sw9I1 zFVlmR=VhEn>wm75)Mffvl``^B*1FWJ&QL|qBb`g4s+&sBKl^QgBXC`%?z8f98p0rT;k>yy`OsMd(8|3v-HgmvT(IZ^(-0s z#`GE!XBSRw`GLGv_MM4lEy^^#B_RZSud9*~3%{*2ML;q{iY1-Ym5Z4osGhUtBNh6D zl1@sEc|AFl4fZ4N)A3J2-eK!>sW-~CybWJh{Lzv}q3d)s;~>$N=iITy4j9i3KtpQ#8!B!S)UZ9w(%(25rM9prrmMfk9zo>DfJ+4o?s+Ihld%TA z)y2kM4MDhZIZz5Cf6x?VZ9;;4c4@GQ#bJiFzIyI&Jg(RwMZUqbA7eqGY$reY{~UA8 z`^RmPUhf#9V@uOC@HfIAN`Rpc_WmM=AGA8GpB>6<91C2@OvWkFy^vgs5fJK#t%%F{`U1>KK@{P3hI}mBIi08@yfh6UDHjp_Q%w;?jvyRiJ&dVv^4D+%@Ye_ z;ena_6ZKd31LqC2y2Ja`u8P$Qgu7>$_3bkQs&!dE(K?#F6`9a#$`Gb<E1Lc$+d|WQLyi| zIC^ERpPbNGo(nqFZ=_d}SC83^)sVJE2(Ng-qI=5Ks`E@_#8q6EO_^!-O2Jwyk2;jSuWvVztHL8-C>a<&imGbBH^g;e8)Ux%xGc9 zpBotlg8HoSK27>2=}pBt{th2HV)xr;dPUaJHv+)>wt*hmxJN=68u=4zjoclcf%+Ow zoW-$XERmZMKF_)1Z*Y7zYta~@0Y>9c$CE3`+v0C7P7iYELZwMXOaP4K&u|Zw;H7jL z_!K^yB;Znv&cLzzmZ)vlcHRd~hQ634lPDE(2h>F6%wWi*h2^aWVjKk}7=S^!Ufr

x~0tGm=>mQtjo3{=CIa;WUH1BU}Oa9}J3q4)|x z&U}R@BXV+6vuCjE#a7R`U*#;eQ}Zn{iQM*=vD@riF?ffs(-q~OLB>Jtrt%DQvqgxP zqeZQUVTHSAwl;Dc?T2A2lKj*50xUjcg}qxM2kOdv6+SV^4T76>c4Y;24axrZ4*+5& zfU|>3SdLkw-yt~JM_9a068q*{T4cf@&X!yFw8qe}vb#QFAn^b$z;-sM1L{Ps>=j5TQU?-1To@c;GsX22hnN-+ zKor#n3KL_JhvIN6Q?{+){5~~cLshBUu$2X3u(a-G0f}XG4^& zv~v4>vafF3pUbWb%D8L}S+!E-(d$3N*>CG{+OH?Ac$k|ki!K6WMx3>;HB?-8vJV7F zEywJHyz=KwmOY_A^I;dnQ#=9I?3^f3smCNLDTo(T7aOs+4K077=KSiHVhfa{L#fG? zWcl;eAVoXllz3;V>gwUgOGP3w%a1C3v0H9}2@laoiIUD2XosLXMe#Rl&urGTH{40F zZ#3}h!XK&}|J8E;L20B|j+WyO7;Wv>Ps9H$)WeKLJp;!qB3@@PTIX4I2=g@uUw zd20(h-0J;2Yz>ac{mYeY?wk>oRrsOfn_=-T1>!^4a>aL9H@TnwX~fj99a5`Tp2(XN zs+U7SC-kDHtAG=B7!Iv09w zG1ide!16M!sx3H|N2V}K%c~SSbDxwxXYH6D9IhiA9NUI&TgR{y*Fw%NDSGmyT&$zU zf`skSXhxDfqx}G3lzEEJ>|bmJ7EiaK4{jls6Sy~fvlU;2Cu3nPi#PYtkp(0LvurUO z+MfVa)E&m%bQb=;hAzddx)$Ec=6&5QiE5={~BK(rgm8}&c z3j>oD?4l8obmMZDmw=blo`IBsGC}hjho+`FEsuq+Ch(0PFNYMxH}#@q2xAMSpCHWz z>*!NhJ8?FrdBq!k!weBSU72?(Jd3qdWQ${Ws18E$@Z{z}X)9T?n3UwIs4H}J7-9QH@x`GBi>%;RZA$s4dy zK!z_Yj60%z%6>4^c!t)2JUTsIg7GNVdyhPn<1!WvzNH?m8XZNzP$K9_lnSwfXDJX{ zngW$4rx$--0>8mAKN1-Jr+xh?E^#q>e?!qphLOZ%lYN8U9}_=p;tp}ui9Mzt(kRT; zDaYqu!fC@^rtKS~Sn8BR@u?OHs*=4x>nWoxxU9z`&Cy<8jV|?;g5A_{uW!muq}5G& zyL`*DK_`F^M>m_6`zP!H{x@^BtNv@?XNN7&uYJ>fAZWEyRiC< z%f}*rd~rat)g77#D@Uj!a)d;*BXM+kSEPyaZw;|xMjzGR_Jw@okr#?VP=^sWW;a$x zO9b9_3*Mqd?FLb$?wupwFxZuG+CG0go;?s(Go#`eKykKM9&gWr$85~h%T!p0l|Bzm zeobG_z6JA9)H&JsZ;G zmkh5?$<@OgtNwP5>&*#aNsJM^Te*JX%wz%s>_CBkQMOIF=8A*UX|G-1t0fcOw=>@= zIx=CU7S7-yHnZ>KRTwb7Ky{^YlMDBHNu$f8d*t@r(lZ_V#**n&o@BB;ljkoz-eeiQ z%nBq#e)}ZELFM-1D7QhIyvJX-nBwq_&*b{S!}Ct9bkD44f<5<4WB7RueviD(q)U82 zT^_vX3uEy*Hzj6bqHE}gcQk-K4=W84jQMZof6QSgnch4}x+Mh}BlXO)f7519tV4fb z!W~))zzj-pP1DyvQfP2k5O1CjdsNdjhz^GpZ0iGF(hY!^9SED^W(Gy0H+6;qr7PEg zOjjXqmVTy1#FVje$@eKUztkf`Qio07QOfkz2@LBedx1%#J!VgJc$A-tK9J368aAZO zNj14t4a6t!-vG844(_%MKn&UGx}6qsje-Mb8$$G^XS`89Aa^=9fjFD^PqM>+vv2f< z0%j!jFBocnMEtVRTU3N-MUwv#+pmPq9{Q(!I4rBaUj45==MK2(R{D`WT0i&;DiJF? 
zK80uP^tGi4mGDkTIHkSD6b-T@LFVq^`TB|TBf4iBL0`Z{U%|Mw09pkZR@zX#* zt9H&|@qBeDKHLz{oE&*Nfx8ti8pl;5Y5O#d5s05A^p$G$@z4%&eU zoZQqB^j1Agqi3*YEJ~_aRBY>dV?Zp%zS@E4bT&r7HIEpG8pe`+C!?J*^N8$_(Y6XP z43T0e-2hTe`KKE4D9Qp+u@oThP;q)`wRR5>rwSu{(4B_HvR6^L)Gm2-v{u$vQE3hH z6!xcRvhmWd&~P?Hgzg)!aHe) z&o#*t9ws;{&YIqcH!9wPV zJda>M(sUmh{`t22*|+8VF|Hr$zt7;W^QK;M&n_S9F7^spMN?OMVFZPOk$pd|Sl<~8 zEad+aQ?*2bDl3KPo~9 zluj8|`0!;-@hoLdw=e&*{yE^xjQ+~>65g(j1gi<`7$#;wZnZ!7l9QjUsv(;(;Go&> z!y8uEKpr(sd$ylOo?|)L+=Ld7)32->@(0je=s!axCZ|d}9j9 z9LgJodLTBogznvzi+*T5Q*=ojT_ky=TE)k1i`Y0y3}gYv0j!##@*}4!@oIOR;`rVnI14OPIb!D5-}CW^ z@)B%F%f`}v_q?Y*@wQQZxHjYqo>-do3+o}IIr}l}WY8`B&SbjOY5AL$%KI)8xpyV1 z|9!BUZ>bkyGWaR)XXDcMec$A_NWSwxuIT1?be8bDGZzKLpEY&c0~N;in~ZROB##1v zz|sm4jJRTtgP@6?fSQ1tJ=#=#?JA9!4TI`7YCc&?4;5LI@ic1^(Dr#tV$l!iHPr?G zvpPX@5^f2#s?2ttC4dT$n7?yHtqD$l<=+HKdD`P2_YZHROsz9f4a5%u$c2;s+Z$p{OT-`OI+Y&>0;dHpk=@iq5fe+``T ze52gn9Mv8sJrlE^c2X=yIm~31+ z8amfn$}!cQ-RP0CO79!|c0ma{k;c2&=X6&d8~s(1u%6kmTO~Z~E?d@GdP`?@1-Bb< zw(Z%oZwSF|QeJDIL9Yq}Fxt}ZYNWS?k3kTf;;nuGE3R1Pp}VFY2s<6E0oUqel8*1j zawM!1enYQ$xY&6|&5(^^R*au7;i|jY9OKz-zbKk{=M9$1wS@3T_lz++ne!?0j;eAJCbQogC|pv{p`llX&~Y$YWtV3ID^ru}D)(>Q|&@3(AiU3rmF!wVEs2y zC+z;{wc9>~p2NIP$#uUaoxWqwE(?DnG;Zbi&EDwkE`S+`}zWgaWsWTAgm2G%T z-|S)9{&}so`>4_jH}#r=VwsxO`!BpY_FydF2m8FMjEz=rUtad)gY2^=y~e@%;dqgU z)>5+Q#>MjQkZ~(ZjopBb$?4AMm0OKpKkWChrV>T3sGgtEAjQnXWyR}dsam5`Js)h(NxANt;Anp*GvGqFDfPBe?+?0yz(CgXFWL7MIBgvWF7U5CGF#8xx2|CL6H4D1VZuIbgJ{7X$l z6V?TfKG8RI7n`G>M6p;1P;cB_8zfHY;T}VgDJ85=+&^1>Tl73UZeT(h+bwkU{0a`? 
zsmBe`&J!!;E}P8aQn-g=|1f2e9yDLG%Q><4imjf~gbO?icL@0!g~`SfD}~j+-{|WGc bfIZ#9{{Mi@>XeGx00KQNW6ef&N7(-WWDbu_ literal 0 HcmV?d00001 diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png b/shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png new file mode 100644 index 0000000000000000000000000000000000000000..eb20b2316fa22d8180cc685bf3886bb5975ab2c5 GIT binary patch literal 204254 zcmV*AKySZ^P)s$YQ{YmO*sTEy{=#H+6 zQb@&O6-^P|KJVYV_uON*OVc0L`OO@ed!PFNkSrCk;{|zf&dHsbbImo)m}AbB>9^uD zmcA(c-=zOidRD5WOR16?X)NVZF16B9Dy2sHiS#d}|52p#hok^$CMB0}oQfp69NpDO4gY@4? zzmzurdyDPw>{CiFOaDUpKS}qI(^x8`nKY9oQZB8eb7>)cB>hY2f0Euz@_$wO7t)_f z&q~LVq@}bcmHcB*%A`se9y(Vqao&PV=e=9vNZKO&nq^VR(tu&J)J(NzR zJ?SIqf0F)3>Gj_-`M)In*V2C_Ju8i+u~bQ!R7fY%C(7&mQ>l>}sYT`+zH1{rmOhsLjr9MJek!eF@&1|gymTauq*l6;u2_fzX(ZLs zQu?v<-%I~edRwag_ZHjVrTkx({!8iqEWIktr9xVxe`BeU)>4BZ`la;WOaDXIpI4>- ztMuPUFG&Tas*il_o5qq)J*!Z%Y4C z`XAHg5np-ZC%fV2?XP}W+yBX@Tp53w?fUB*FK(quS|GO^>$5@VBPo|kG;tp}ch@I# zF_xy1q+BZaRlGta)lw-<_^l)r(nuHsOyMEQne(jqlk<6r4T1Xj!m}7cGB$Le0g00c}%FGN^ z67N!4nhIwu!Dg^|TK=)IH7nV0rIbsxv|+t|J;`m`g1=AYfAUwfPx$NPBG04>yFbEI zBxzqtnY1s>rMXnbZ8C)4umlTfA?4BxaTaWwu{ncld8W0E|2EPJB`?gAFIj@_?aG}m zlk->k)*EL;!;N-*GUDwVHEel+bkiqYER18Q$Y9aiCtTd}-3zoYNhzsofyGHGc_8ik z0wnqF;k(Dkt+j_r*8fu4n1Ff)sFfzto;1SItl1I`-;$R1NwH7%L@&|urp-;3AcViI zp`RkcSoz#cLe^i%B;(zmk#sCQK<}sQen~m*xs+zorTt3ckxJA(lcwe_D_lY~uv`9a z?qze{HlBYfRnjT9`4k6GY>CkS{V6K!o98AqZEmt8h3giQSAX;DmVb_YuCUb+a?YeZ zX@(_A`B{kRTFA*rAuj*9xYU2BQ2z{G?B*ahz08s>0c|& zrO^|NbY@!Y!jB8;PnCb(mtt$u{uC9??arCW|5H{z?8KCaXM_Gn+$dA{urb{iFQijx zDdoTy$=WQXV`&c~pZ3DiUm^VKOq#R9L+k}IE2VwuP#QnU0c6rxn)4>5ZT%Jrt)&Yd za(Hoxy6u5~Y-}f{Q0^Qvy5r(KX5tue^)qX;cHerPWyyVB8l)onzK+f zo3udzE~EutG+exWUcT+yzzE+irK_Ft9|71*?fJ(@?nxJmoiQfl84(aQcK-V4&fO1z`@*ktWrRjY7cY3s|C&)j=9a99c zZNOKcBb)7uN8EXAkzIqTr8m0Vxp_hgd(th0pR7tF9g@~&NVKp^u7x*3ktNNEcvez@ 
z{-@`v(TFZ~6^hx~(~hxW6&q>6pWSGEU4#)M3oR}%K9?r{-J55IyOTw%LaDEA65)uQ zH)h#0UMeZl()53K^fi?Z!5cC{{t>7_pvV!Pb^&OGyqM7XexpamhyWUty zx5I}=$~&>pwM8#`D?fAqrF2JnReD-Fln&T>A% zVBaozk+h{hk$x$?E6Ublz_+j@@?3gX z`sud(N76m%v(k&wkyNr~D~ef-bS}Lv{cKzRg##Xee~E?2LFEf}(Mnpo19r0*)=wDI z_{OlLJqSWrpNzj=b8!J+A;Kf^IyZ*@Bk7LxlJt^vAEQ~J|B@a_A4=~@Z%8LQ<^PiO zqI8El7t#_~AO+WNOTUmF4lXXGBk2|C^U|$;$X;VkPNg@u7cYGxJ8O~o6_ETf!Sos% zz9We!ahj=mD8iUazOJy{v5x)IvNsFV^IXdLiVd5pv|yx{-1ZzmAstGemHtHff^7Mm6S|!qM_u67Y1xjrMsZ2dy&Paiw~qD z98CkxP$IL8_n)vOc9&jC>6Y|m=?|pOO3zEjfG3h>QY~FbpGbct{cGvRE`?IMC;h(k zr_vWm;R7&851f+v?@5hxF}V1)^kwN=(u>q#BlZjQ{-N{>p5kiTh9)0yXz9nsg0&gd zz+@-nOOR~%3A_AScFNv+^UHX7UF!N=3WNzajr5!b3|O0vEfK z-k1J@FRBM3{F?MN>5g=wBrALq;THws&vBx~i6~?6X^ewk=Eyr9Zg&(mC3&Y|j zH$iYI`utt#+tTky&!Dpv4s0oXBt4RTiBep;{L|unSNbw}Pz#UXF*koC{Q#vnA6)!d z>08pbrF$sM9QmccJCnYL{-5uRAtKEVqnLKi3Ph*?LAmfSWQ^-wAkGp)6jYRwoI0ST z>jGNfrz7(1hWK_$&>jz>0R-AvqFg$ZJ|q3U^iAor06_(gD}x~LSd#Q}>CG?$g>*~$ zn)J`5Z%g-akQ0`Ei4Ay0%88&7kaF#>J}>>L^s;ncdWwQ#0GKHhct$FuUr4VJocBJ3 zAqOos;EJ>{>G_l*O!wwd!D<15auTN7WUdqJR)sz3<l4^I2(6dQ*B6-`MH>7&o{_?6P8u&VjR%KT5)yc@elGOskN8 zZHB%8e5mM6HgAG5GT4~RUxD}h~Is)BbfYxl45*s+db&OeveM4u4 z^52tQk^WHniu987fVeMs0B6!IX(`R5x1_hj{)7sDSNe|hqNThk*0=-_c_BLvBF|4~~a|utsk`DNyWWXN}tfz3U&m;dA z**XnRQCVivN@v)KEMEQ&-ZKanYfIcl_6lPno9m0G(u>k}rEf|1r6baus91~$+>)t% zMfz*$;kFw~DZ`3&?#X* zM`1D+Y&76s#AhnKAbm^vy7a8H4+dVFF{!}z_oZV%{%-SpL;8~RjC7CRWyWzDu`aiz zry;gpuly@)o@cpW$!+)WkYj9EZ}Hp*u5c7{!o;Lx3sA6qe8Xa({{^`F1Q)pix*n5q zM-UqSUEjGvwrd!639X%gAT$=DPIsoi$qGCp{jT&o(o;s{ni*!X;oC2z=cN76!gTT1 zrEf`}kw$pDIml^2ZDt}}NKf;ATW*L9T1+F>{De1)_N%JtOVM#l8e<=~TL4 z{|DyQk{soR#S8=GGe ztn&^4I_c^Q((g#GfcylpG}*QTGP;({)QNB=JxAb^=maGWIKBK58?rYD!qUorR{A~Z zvxt2H`nW{5gh`q2G(cjPhl*dK^l2xj_;SrxpLhh-_VcP_eI9|zrZ;+QWxyhT()w>ehIq$BA|U=N27mI4keP3IHjdJ7wVz2{z# zUI8{el|GRQ=_&B55gFjbc2OH|{xZg^v~`+t|2=G$mr{rNpI&JpouFZY6`uUgMmjMD zOMqfM6#2-a=M{dl1`R1`g$YA;ZTY4~VN7M>V$1qpkw^Dvf}IWDlb)7t<6vi20ct_4 z61ye<*F0WeF5Qu);I1QHV`-MKKtp>$T)Ojf(lb19@CPPXpB(x+C2IaH`lY2k7wH6z~kT+Km9{ 
z6)|!^4Gorf2{f=6Fxir2xg#AI`%|cK01P#cHgy}=VCcrk>yrVNuQWwF>9)p6MM*#&?(m=&E3pP|1Fc&y1c4oSbR*ygoc-?{XZbjOP2k*1dB3@aSV`A!y4 zO|m%S$v4L4FQp6Ved!zrK63j}!azw&FeMMjjL$iPf%AcM>Ud89IwyTEv5z}EgQ z3xEEuogmdXpGFcDdo^k7DUXzsPFInOWPI}}W$%4ndS$vEg}uVHI#qt*BPkOyL$pIwh&}}1=g;0Y1m4gKKh)As&nz4nf2NW?! z{Y11!;qisckoE}BxWv~rrhrS!g4?iFC4p0J2}+zCH~~k} z$zTJx3I%dj(JV6JtIv$vT7tzCj$$5A2C8|~DTZN1NV749ZMS*C`mUvO0_V(HMoJ?R zjhuouc>b1WE0I}FS{60swUJ4yc!U!Cbc%X4bO6j*)ZBvL4uuHoH3AYAu}*0ugw~X2 zvlGRb5SFCp7HHdef$Jlmb>LUv>NS*uZwJQ1Sq%Q zTb4WZr-c8N+v6mK9|7m3SX~1Uqs1g`P{hnYm<{`~#-*%$3$~DL8l)(5<3V1m@cV1O zxN`t$0S~C3kI6`0Nf!#+mfoehoXY$P3$>R(ZoT2lwE zOo4`<+T2o&t=(8fKF2DZ1JzZo#v{V_C=;$Nhz(;FZPh~nFqXSUQG$7K$uexVS8wnw zQ9BB;2ZuGHsS17UHgHbz8Ub$FD2X(hz}TpbL~miQjyn zC{QD-B&8r~WnJo>4R_~KS{`-D)+^0!)b70lAzYfr&PiOB^d{d13+X5nF8M&A3NW9U zS((!KTRXt)oS*L7w#l*n3qZ}NFlDa%W9g3cKsu0C(y1iuv8dw(!E7_wz%IfdUX*5c zy@pX6QrELSvBOKO$GP-S%A|Ydm~-C0SHuk;aKPrth@@mdSj1iD0}_S$H2)<5Ocaje z1tSk}LO|ynCM@I}D*edvg%I|9pnMUE1l@DYa#q|k0;9l>hMy>-j4h97Koe#xbBjb; zH{cs0#u^Ki^2NCk_tyyW7P^vU%{CGr z3J_(=8fr>wg9frbU#0cupI8bW;8hg#W|q)2+YYQnX%1*nt}z2SHe_R#C>n%1<+vbB zTS8R5iyJ*beudfACd4)(G|73x3wG5KGj>h|erjzlLj#(cWFpM>wvz%a*zhcej-+N* zXo*0hK>1ocNy%30wNch)Wm>}IFf?n@@dmp&@!jJFD6C{=UNv%xsCCA#dP!8vL<#66tix*NI`x5@&r{?-GYv8jEB)c!8)cM2r(;Oj(IG^5355 zCg@~=Hw@BV#g&1I?nvP?!ml7-G+2@dC0o96ZbiIq8CTLLfJb98n6S02W#LmdpkmV= zu=R7I$%Zs%hCK^z<&Gluh&7F#mDodTlS?KY4{8{ZUZ)r%k^bi9k}fQoikPT}!sWI? z1-HGB9!q=DmGnN<(EO%R3YFDY)`aX4EM@qB1d=}<6cI~YSG;OXPBR4^Spx?y4HxKq zKt!;?B3fI^#*$reWBQe&Hnq{grWCPT3nMCQV;4PD2Jz>HJf_baBfF7KrH`dkSK$pg z@?M`pw!r%*-2JYx3U}adBhrkKdBV`P;28$M)ES@*{eR3;2oB^JbCDiuOnti7%N36l zptJhfNJZ9q+0SOXPisCj>LyLUf-VcA8rG$Cy+bV0%-x5MG!>^}! 
zH)S#5fsIp zOu!*Qe%S2X!-8&iYMiU^#8ZNZm6e-1B8hVWWZ=zMI+kW6m^qKrv*AG?%=n8rOVwJ{ zIKceoCIdPF1~0KS0hP7r#Xi2Jb~Vh25yv*pL|C=pe`H+Kqj05vok<__6$QJxq40en zT^hi*!+@6b2_E7@#NP!6HAks8Rx7-I{?fwKh<`RtcS}thUg)0}z*`tsV@eNMEC+HYEqC4qE%2GfaOp4y6*qv0)~OqS4&M zBCE}5yLC?Tf=R;9e`$Z zh%FKO!~(OL=&dA%UHV2!KMJ1Q6gtp^znB;HZyGdeOhqvbGUTZw7tvB03up^~-9-*5_ivu%Q(exZY&yZzs+E;@tJPOpZ+Led|3ZT6y8}o9Ie+#?24;d{dItvtJ z4GDd^y?Buwj97?!0x2g!@Rxle1 zOl9jfAY*&9yluS5lucJqO59P$&CCJC$bc5Bkb$MO0AwS7c~>Dvd<)MJS)Gc%91ldO z{t}cnJZK7{wjr`D1`a6bX*t{e(&li~mX^tY#2wGSp~X3VJ?a~xbELb%s%J6{e~b?g zI!-V-SIlVN*wS8qYBI%JqyoMu4cXik9_G@$$3O+oZ~zmVV-Xlr(BotOWwB2q#E76P z46dx8pH_6`1=z+SgG;R7ntPsEHD88e_G9K1QJ{8&;}_do3U?v(z!F4qk1sK>9!VVrUiQ>oe@LL4IH@MVX{`o z0PtI*JqDFwanmHr3?O*V;M59@uB|LQrD4A7Ve z=MvPbE2EF^0N}<{s%uJC(S_2GLOw({&peNay6gmar^1Gvk<_JzfGgsFo(+!_IEu(q zc5GC=F4_0_P8wfE$TFhkecznPu@yH*hUE8P{0$VxJN%*qZTYz0QmR26I_cpR!?mLK z;Q}8bLn?b zZM>#%0L}LP6{ zM6l>r&$`bL29H*mhCg)PSOU3D?VyM@7T}y_iI+q%Tku_v} zOT6lC@FA4P$$2eHJ+UbrCBNDXO8FI8PfgZ%zy>-XrYp#+BQv_z{u;RWj=>SJKcc^5 zjF=8@(jSNQBOwkhTJ*y0^A+7E2=0=Oc!d$)D5IG~E?@g?gE$^uB61ujB}GTtFam82 z174C7Z4B|3&BbZA$)tf6uHb_oSPC8RfUhzlP3 zh}~9Tp`t-Fws~+7dm_lKdJ$!qC@&RgVRR{U_9t2Sa~>g@2W1gzv~rAZVgZF=7Lq9}3r=HIedNn=}yJR(s6KtWdq)+m{B{ZEX5hfyKy}V9-)T+QA}70Ngn^L1snF z4XSr&m4L#7klaxI+Eyif?=7H}_(_j~61hugUzov_^*5B>c-s95DWzpjn^G?MNh(SRU&0Q!c~dci7`#{5_sL82s9*}-C(O82E_uwH3-_pk>- zx{iYNtWS*_h#VKqw2F*cElqfG}i<&yAm&$KF_Wn_IlMHc+qkU9I(- zjOm|?3Zn4pl?8;kUtC+Qr^Hc-xF>l0DHK1WwzeY!SW1ti19~}yETUzh!VQcET(Ah) z$K<$Ev^{u98&sgT@~t`TsB+1ox0e6eY!xPkpJ1@dHj)f{uCd1#q`QrqkOJ4A?tBHM zbC15TD9dKsHNarMM*C8TX2q>WAm2Mafg@Dbts1Q21|6OAm^~(WGVD{SolVKeQfc~^=nNi!UX`@U^E17*{~kHMvVC4InJS_d12Off@@Q+ zp&I2(oEN0I;M0`^K_xOQwiW20N?(W0=qF{lLj3t z$R9H^DP1Rd5HunK5BxS`700O1)qsc>AW6_sW`;i`W0;cj`A!O%P|`~#)>Sq~Ht;LA zXH$>iTIO(`@plyvRA4C{OkE8Q^{Nri#Cq0ROkD-^KIe7Xox+cBX$9%{9_eoi#*WzC z^)`{PWwq!1dT(@pU*dzNrWBn63!YEZ+QP1`SWdAEddgc4-r+*}gcxPcGh7(O{3(*9 z(#>#7&H9jb$C|a8QwI(@&w9H8yRk~^2+>DhMXL@4ZKwRN^RB&^|`Ek%K`6aPSBl0QGxkF5He^Xq>H%J0 
z4jYie4aTr5Vizp$K5uwsMd*obu=)v&U=x}(f_yaNz1DyabE5HH1Q`sFQ|6@xVxN=5 z3))S*NGHO=%C9x`;*`YK?1Bl|_{mK^AmpE0Bsymambl>zHR;t+ozxJiL}~FrNf%qo z^*t4+F+Po@zZ;V6Tf|@&PdE%WXNOrX$HM$*H90BSbv%-YcQu!m6xo|viW zRba0mOV3!D08J>Sq8`1F|9}oI%6%F0vjAD2ip#5U}REQsH_ZOa5qJMp|xi ziLaa)hC4EP=MrZ!G@z}i{)tWHpBNrm(+E@{*PilKKz$qOvElil#u-GEGj{+Heil|N zjv7*p@9*$H8V7OY^&7K2EO$Mkfg*GeJwwu^DX^rc zR@JF2t?n71;BsxSWEt7anj~Vx3s!?9G60tv8u-jKVxI>Ln%=O3U`}dL6E_9^72M1@ zWJK)@qJznJEXkxbw@%o&?N9}yb~`rjdrs&y;TgqniK)ks8OxI+f^&XSSQAF*&khz@ za<{i|Q4`yRb0cas@1xON)>!{*t+XF`) z5p5@7;?w8Cj@b#ua94&%MU;=Ed(<5w-ii`_1-;H$Z5@C*wl`dxOI&kG=gex&JBoM_ zL|4q!uB}pLWrr*b(Z_;aGUGmh-vo=x`%YL5mQdErueQ(8;+r28brQR0o(pxP;D7dG@S(7N3ENTL+Hxe0=3Em&V$ z4`%GR=&6tLKu`GNey_ax+5U)A8Ct_HIu#1sxa6@XEZ!rQ^A zpx?EzC9f|Y0nv{EqHE&?jqw;MFgzOs0O7Hs^K$`YvDZIIWJCtJ!vO^Eb!xezgB3~m zOPg_B1O!@JcC9{0v%vvmpJ0qR)+y?u9NjEO0oG&iM8qpbddzlq?peUWHB3 z3EEz-6j34Q+`f2X(QIZWp|^l*!W^%~2WhNEw{p^XZSP_4#VJAJZjqXvNEc+m&9)5- z3Y3sS!A`i6K9ox7fixop>|o?@ojE%;Xu5Nw8U=cv-E=05h;G+J5YfmrBdr+o?Zal? zq*No^G^X?Pi$+91{Wu&^jvyG_*=0AQFmI|u_U`Q$c8-jLzt8~vC1A_+E^D_HE89D z>dTl{olEx&d1;99&%6$wned7}TP->}Dmr{GB^gpN{8UIgqKu}W!H8qtR)Z~+Tl>J8 zmhe4e-g!b^cmNv87g6fWTF^4==?V+q+_ZlK<_PPb8*CGMp|vBTx&U+o2$CSUX!$51 z_Rj2!^1&SfwG3mT8E$L6^D| zKIe}b(7qXB3+_Q>L&kP|JX6Ve;i(NLxMx7bnsx21Vl-0jVP7{?&>JH}3?YOF|5xnW zfET0Zai4RWFbY^tPz+(ua~xB{ZA~}3`%rp?w%FXdQBYh^<(fuS^G<% zCIuqs>}RJ;fmw`f-er(SO0;@yC8ENE$cA39J4Q5_bV)E4^|7By{% zT{Cyzz5wY*Qv#f5`v@A@Z97t+v>f?b#ZO>6ar_GgYK<)DxxRQIeI$KM18g*$ zl{|k-aIml)Hr%}v=?5}|JdI9Yp zTR^C@|VT#n)u)t1!1wQ$XH4%gX+9SJf z5teomi7KMBXQ|#@8fCn&mv4xvI~$nN$4}7QNHAjyr?0NaeLEU*L)P(FdXdB#A) z>GDY0x0fF_5;k;uo&vx(Hvh=m1Si<2-1qN_KB{A2r{L}8hM*KyD{tb?SIp;Y@sj&S zT&awHA%c_+KQ3K{!B;Ja`jgQ+Hc({(L1^Y?Yy^!kitHQPYhzQx0M>g|>=gd4IqPhK zN{nnXE%3}c;<%RN-<#AjjsR+iObhEm>w#A(e12bg0Y@{k`8!9XgG(ZiP}pm~_qOzw zbYFVPnAWYW-hwUtXs7nK7A97Z@mlC0YqL>b^qTZ0r|9L# z=@ya;>+ukP^ID4O;-5+1m+o^cwTMa2rH|-o$f3tx?mST|qY>;LZg7pAn7|EvXq_XNaRVr#ZBL z4b`(p(ko>5g>~%&N3OMAhwkE2>6g-*(zBT0oQp%3MM8Qu;DDDQ{oI~uV*@+(0JE=% 
zV`6!G1|+rOA6s>`F@i>Epbk#{npmsTp}6y35_Ocu8Av<+O8QvZm)6ol4mTb~ z&`Xl!mMceW+GsIo;N#x*fd&~J9``vZVkUip8K3m4R*kijwITlzm9B!Jpnw|j&SVd87d*f~(=VgMrtF~#xFl+nzn=|ph6+HOM*g zU%iJ?&w0TU={4lvg#162{s#Q#^OT`78?kvo8Q^v4ZP2Nn0c}wGsU@DWc&g;krAwBx z`xG{3s)D~%8IT;sjC|)L#pI%vID<=U#tc?@lr;ASJJ0p!Bk8Xhr1dmVdE^T2>K1zqpbJwb4Ty~Ox+4Ev7}j61X(rnw=GKtK(gL4L0;g0#I)W+jAu!R7I!tnf zx%5zaNqPVb9(ZCwruHN07nU7etLV>h+TFcvQ<&nxg31eIOnfmhPcrC+)TBmMX{ zQX_pHD#4ubc4v71N79d^A8tE<)*3KJkS`ufGDgc76mS@Xi4bSOPg9sRWs?+^GS$3A zmjp)53#@1O{Di@TM-9(@iivMeAEj^mCU&TD;fU+b{8|r5(f3N{^(sq}Qb%59Gg)-T-5H2OMT>%_Ucaekt_5*x>*& zW6LBw@`~}+QC6(Uk$P;>7VnyXm8igZ|3p+)+rca#Sy4B$F%c_bq6GlQ#D<#guHz*e z_+#l~(94pw4q9-sb|=z9$e@U>E6W><6-Kq2mfd6puekz?n2ne}Eh`b>({bc9j{aAV| z{ab@TWRp{BCW4>toTeG;)S{=Y;r{#JTD4i2&JD(MM#_8!V0Q(yHZiJb>zk+M7=a^8 z1w`}S81h{d!|EsN{|}{y*qxcRa0XBDl8WyR`Ts!rgwq!Sw~j5epp5nLK>o=FrJ(kQ zFou11r%38l`tS+odB_^Y3rhmRb!EJNZ5LkO{4vq%+_rjS#F5NeIl5M&)&{JmN0^d9 zA6O{eQoUI3ECR=?CLrz+>E}KfRAkwyF!I4oB799+Fi|>a&_QR5GTiC5{$ELNN|(|P zaUJgHYVgNTr1yN~I|q=%yr_ZON~@gLu&>5k*7ybpxc`#azXYy*L>v;7#(dj)RYdwF z)#VA_JEaIzO82G5J_-CL8?=z#XFUhLI}~U~FIEFw_OAJ-wvRBjJ4v`;uH-(l0F2NxxS{cLxP;9oZRr9;fd+F9HYB__?FEuG7@|6`GVQJ zH=uCcB>#7}igm4R;OZv%pGynrZCkv7bebnD-Z|S~aNBn4;@zLjHfnHyOFI$G3v#{7 zN*Kq1yN0L#p zX|E>RLLtK4`hP_*QEu&-xIwQCbs)vMe>Q2jyPRikNi-ibd;R*0at!Sye2c^@n;LVb zu>i1F09;Csq!Z~bA@kY@oGa;+Fev5WBirCR^#2@>pPLnNbMPenzmQ(HoimVP^Mq+2 zEjiYQJ13p=SH819y+*L2e~MCN)87jEUz)y2y#CV-IMJB2ZEV5jp+OK8i&Vz~R5nE= z;JzcQT5XfC=F(jp!0rW*ndt21KhP5{-dHp2_Ub+9`evJ9@hjfBwf_dg9$Etx5)k!R z`e=Y6C4}IfbVtgmeJAKoB!rX*B1s_3kMvux7>&n6}U)ur$M|`;kW1ZjX5Rykw-t`@AfEvr)==C zp~q1@ORi$`_o9WriU>bt^Hlh{U+-cMt8!~~N)~r!1}Ey^T^NuN|%K%E$Is(Edb{*S*i{w7K zQc6a3>R7sGC`MxgG9rkKX%4kNQ>t#>wWAEC=x_wC5i&JTh@gK}S^ZiU|9W5a8{Re$ zj;db-%T4v8k}>qb8EI?OD#onn*vc~-7WgQdF*61^KP}zDKg)c@*rsBQw<4QgNf5uG zM1J%2`%SKlI%Em#H%3}vu4|I#UN4>SwHF|>D=b;qmI_cR5S3mpQej>8DaQ`vCvMPM zP5JLrcz*}i|0WkVNGNE?SJFc%hh3E0#F7L2bKwSb!!)}ps^xy7DYpRSp+atp&h z_Pvw1HvOJZaR~H#MC>{1hho2NpT?$v2KzU%{Qw09onDfq=Zzf~8aU4qP+*V0ximPV 
zqv6U3s4JRH8sdSL`t%b;4?Ip+f^gcr~bpm60jlR*|S1uoaS zxUeA>b0+aGC`KIg&rsM6LH+~%bY>*c{T?$FzaWmE1L0iZ4<>lPt^yVvk1GP4Ln75v z=`l#^*jk}GCmJ?oW6^ySzB;MbePiX%hZm`7S*JyHq>ze`1bT%Zn3i6mr zeQQ}$ODFaW2;o|X8QL2A+j)J)wx*+3=WV zQdkkcrZ1_$^e@5Ks#y7njTmbYa$)27H>~^^OZHp%iJ)hWQWlJ}IhHDrHzA|mIH24d zLIdf!rVk}pfNL0SK5eaYAmLd@oB%Ee9VJxxV>=t;x8c)?p@FNJ6Lm*h_sFKnZKxHU zIgw-y-EPiUzSPK5af()BHRWLiIm!T6Od^Z*3Gzq|pYlUA;kU#C`$|mhyodBDaK;T{ zYPM$?iA-r~RIN1FE`FHsqZCbjEPdh}w3ua7!5B>BhjW{Yd_*cy{I-UlW32ZT-&=x_aTiK#cm=Lg5yUj$Lq?vN7Juk7E5FapPqjeW~c^4 zoVefaZwXBt0Nq~o`Sgpk{_C{{80;?L#|0H^g(c1~rsz~?Xd*InvzUVMnbQI@hshb0 zfCD?qW{s=8_TCMDTF`KkF1|&Wl|#*14(3?y{J0c+37pbbKYLRQ4Dz=EQCK8U$H8j*tP63E+#lK-6E*hlQS zPYFMB6NK!70Uhc|@*nV529j?+b@TXw#kVw1Uc4ZhjtI8*>}-lD$`^AQd(#v){IoDG zQDdXoV;Dd&h(1LCFrm|9W5^-uI|w_CQLd9oB_^`QW)a{G7tgsMVH1t^QaXkGTF_uO0;C_iR<(AXUTqQVh~;Q#ifZ5sO}v7ZWH2_0Z;DqB zIZ{$aN6MbSfpZb=gJ|V;zHr77Cy#s-gkO9>^Ng%;+E5P?16F#?jB)WC`NuTR+$NW= z$Wl*tc4Ef_jGR9$AU-@aEa48%c)VSPY38+`oP@fNK9)Y*(Fq(* zJ>#btyYGOEY>hFWaDDVInCg>6}>;1iP)NKw|SJNPkay zUiw|>cck0A_sS-pq`s)vq(75>X$Kx$U;MK4`_gBmBi?h0+%8do*QNhWdd=E%ueZ+2 z(pMSDbjaY0OFHq=0=^;r2kEWtE|41QpB`z=*4uD>Lo@YkAccm3j2G0%YS_?Y$i)Q! 
zH$lk0@ogg09z;%+*>OY#;7^9>3sP2OgUcsSAd&*VEPY*iNjhLM>;e&H4A%R;^zWp1 zTnf3gFMUq>Bk2`PM`Pr(Fa~LA%Im&(>8)Osz9GFJ9Wxa#*psQk^{(_2>4$?}n9%>U z>Gv@pjXhpc@O_#C=9IKL7D|b8+M`@k7$JVd|6C0k85=7LoTFrO=HoW~Q<`&{26jiF z>B{G%FG?@Km~NSKm)UusKal=Rdefzl^#2v<52equdBl^a{=4_352bgd9}Ps9E`Cw^ zs`LfM5oHX&TGQJ1zVs96F9&xHGh7?^wlt$zpbl#m<$N&HGBeCAN@7zeZ0e6^9DX8z zdd$XZ2%svn;i6l7^91`dlb)BpB7IRhmTD>n0$_Y3y)FGz`fKTJmqJ>+m!#j9K8va) zy?8)p$hq`LdR_V}>8-)VGwB)WYtrvZ&ojZ~%Dm55I+gxf`d{35b+dp{_?RgOlEDkocckx1 zpJA`2Cprh`zAa6qpG#-b#}OPPi}#H5E$P3IUcq(UW(w>_JoZYuCw(M+6xHZ-@$=Fj zN#B+pFcxwGnQUTnNK?1!!+{!Q1Q=^OQXoRfD3T;ge%nqa?O^5rs;CcQ|=6hUY(<3D1|&G}%Km9Q+y(e7K}yXCem zG2J=Qe20Yvhj+uR&MD{zrFUcVo*Wds4o7FqI~g$$H4?uIJmD0=Vs?}KpOd~LeN%cy zI;M0pC+@kzN$u17^e{v?m0pnkk@U}{S3zFZWVS2mJ?Xx5DZN2RwQ>1BkiIVciS!le zX%y)a8#ac1@Ibnt8Mr6^!cyjq9m-Z)WFB>bUMrDs;0w;Gor0e&Ny0L+!Nv|;=&o!i zGmqIMw^7w7^lUhmsJDMy{z-)2mA)&zY`oKiYu%R)q^0z+^epXcJrO=FeM|a3N}nTi zOSPu^R8mux^bbZ6d^ju=UH8W-P|z9jvD^i}C)_W8L@xId?x=oRS|>4yXPXA~e) zAIi)swJ||-YUk1JTU*Svs^;K)4QD$x&^Uu4;UtP4eit~z%(`?pn5q~(S6gjhhyA%L zeM9=^($|>FoY^#(!kPzPkzOVk?Cj5%rEf`JlJ1yGnE{(-O#E%77o_h=r(#z;kiIQ_ zM|#Tg^$C+fgA$&ygy*C^PGB?$s3!bel3RE+D6{j1+riQ7IUB7t8fXjkvnA(^K<5to zq%(a{>4@U{%2-_$PgL6ExS{(^_|4atBsS*a6#E~8jh1#WV%G~YmtHXK4T!w7mv>12 zjyF=xrO!%#B>fHpyk^Fnij1_Sjc~Ga1G^>2j0-ff(b(x57yNOrj+h~*2}5NLj6q%# zK3pJ?E`6xXA%ta$YGlT0kKth-Z$E#M|L3HCBz>90A-AwvcB-DFr=+J4VVy4ijPz~k z^U~DLZ<`{*bA}tv@!`{0yqWYW7tiqfwRNS(?rkw6lWqCuSnP8){aCtdplHJ@%m;Pc z)-*KwD2l!TsL}C)m}lexR6~ff4^%U@JgnvZ=fHG3EBzzsE4*Gb@`;3UMqn3^ zNhiV=q_0Vzmkt=*8D#qw9_mspl z8xCg3!H_Z)>y`0+QS&v0mJ!{SdZTy7bMe>k}KbJBOEFHooqUq9+IWRnlS>%(C> zKN!Wz6&Tpmh*>R(;N{Lbh|@lt#W7_)W2Rzh1%xc_ydv|ClINvSMK(r(>Ge?h{g0&2 zOJ9%ZwuV@(*7()Ib2^{f|kW7%hXb5Mv4BB756&%NBDS*{Db$|+7Wac(7^+@;kg~7 z+5+)M?Cx22LnM-K%Ri~`=cQ-xo?|n>!N$lqF}aIlWAQ#C-Im0jU*N+kWRVdqZ2Zni z|34=kk*t>}l1HNvG45s%wAiO$00iq|#Pv&xIh%nGZEeWSC0Fj#6t?MePqIP!8qOgs zOiZ#163D1O;AzL^-IrdKzDW4K!ILl9nwe44YR@2(2w#z2lEyaBH@B@hVMlHT(QjJ3 z=cJdVXN|B_83J)2i5;2YiMltu7GNMZbL`F-#Bgax>P4ZZ6Go66YHL@l>>P_ALydPf 
z5(OnV|esI3RA81<#l$ZVf0FNY*fQ`?BNXP{f=}8#C?yyF8G!!R-qa+GDZ`}6a!k?5L$13Y^|le zGf1JTB3}+$8C__ylGu*tp99Z~ShKMq+)>XcZ3}d6Jn={F;+9ekg}K-6LeA z)?;9=H3?CaX(PU0GPQZw#}f!yZfIp;`F;%$cSQ6s4+qfD%lKHznKXMwz8WEELk7MY zlyX~Cu(Z*0t&u3sfuI`f!j24PD%}C?3_`&MovnGP8Xe8K_q7O*2w;}@?O+b(gtQk> z*or~)HzBnh0l3#ZMGOR90PgfI?sWo3=1?2rUXn{x;4-sq0~~uQ66V4hmLiT0(7I!j zw`7R9nS~^xl^tag9hS434#dc~$t4k(@MjkgQ-hGb9@L@ZPe&Y27_8x7C**cA>w1s@ zHFj)PK*W_rCh@CCVs;qNoV$(eJj(DvK_VaT948Z8(+1^kFdBzU6|Ir z$FNw$u80w_6>;#68bD;&YhEF2^VIlTD^1#-iNV}PB?vZiXxmE z5Sekol2dkv9KfIq1Zbb?D=GZ*@K=^7WuaRogD?y((jabf3~G`@2#kz7j>*5ZJc zkiH*qGVt-X{R{mUw7p9nvf#HBPQKg;;bjont< zf(d&SBD_F4T@sKRS5a_@rU*GRjHeo)I2ppExcjZp7_g6;?)*TkqT+oonm zr`paZ(i)Cb?n{^(QZR;4lJ?SVn8a&47Nz&_HN>Rns6*G3?^obg!GRxJZK0?CImWay z2)!^UWP&ALn{DncE({)DSRN3A5muC*8js-BcRj5c9~pr))_ld=wPE=>F)XR^PAJC* z2|GuPBem_-VH#HN%5-JK+Qb{LkxK6cGLK~~=Z)(lvcEJA;tmL2eJAbMm>(6JR6dY>qzjX6Vck8cbm zEzql#$*wcYQ7LWlHB0)+0{>}w-_C%Jsi;lxI9D_rH5@Tp+HKa`2DCL`ueBOTg?0pY zKhTZy?IRT~>6~0zm^ZZnZUH;y9)3^YhCQ$f-r5Kid#3G~hkrZeA1L$2{BCVCo+B0M z8PK_%XkVEYm%Q)>>o*?Ae?pn_*gD1+L=R&#K#A|yBap`@8upTeXSkly>S>o|DS8gL zFkt9Fno1{>(j%20bIjkF5A@8~D(4jblWqhelp&O{fs#9t7-!pMRC(5D`z=d(0}=0u zpoX+LA<8Y0^Q0eP*RH=Af7cRsOw2-0P4i2()}?n)bQ!>gLG#bC*Uyl9^mB$!8)TFn z3*|(*0*qXe3x@+JF#rpHhl0f6f;}8LK@?%mFj`makR4g%hGmR+F$~?<0>>HsQavI; z4iz{YKt8~F4gBz)!)u82whTVjzbXDmML6Msv;J{+R(`Qb&)Bp&KXe(OqUoZVk zZsDPHK_zWO7*m-c4fk5@Z~%eTJ?2|QB7avBsreXpI}AbsNQe}w=I&7k$!TZm!N|!2 zpGcW>Puc?voH37XiP4`Jv2pmRrAz4}X+$X|*z_Y@{D{#+C)*T)WIzv$KHq}pS!h#Wk{Uc| zZc*zH|Eq0hY&iQh0_T0W?;Gj~^-dml)F6C+duEzi821PzdmULh;ljcY^TOsEL`}Zt zk-HeCvh!*i9Cf^8iQUZLCJw7tA%z7Uf`T+T!)^xtwLrkVJR(qmf)djd2;z#O!wA*b z8?g8`IIY5RoKVj-tC-BmC(_5;a|~pSU@hi4T9ImQPUpQ5Ob`Z7y(xF8%J(q`8J5W zyS}EGyk>i>E#g{R=o>_XE_5$r15Ux`GIN=Y>Bfp!W!%s1uW!`Q?K8Eqe{^2Vk$7!M z;q{$!Y?!R%7)ZJ>U@QBZ9p1-7bI;tCxB;O6LmFMU??$>s)N|P{mV`+Te=x)V3*%p` z45(YuLmhJI%7^|4!a3#AN0Q*hOz;Y=rRF=R&<(K19*nsqIrq$p1j5L7=|iC3(LysL zrdruVrC`W86SnsWv@U|!kX&Xgs@hPX-t(`xtU}Ksf~$GhXjJJoKtqSy<38eATC5V` 
z$??uUhIr1JMx+rB6>wf*Mf~}IMH57lecY(?zj7?$CGs!)wkwV}To6ZZh@{43aC6I= zMmtFmVTR5OfX?{};R7$JhIZb!MaY>QA@020BS4wb=+#{jyCN=*1T(GtoS3@C85aE` z2Yc(E14JHSF{6WIVkMaciQ;rn2Q5K<7lc6>gq9=mExWX)>Dl z`}ZKAl=;tqM@npHfjz5?OVefHts%2Pb(xWcEwPQkQ0&35QDB;Kkb7Z_9FZz5q*G9o zwjY^wIxr`fOIJ<}OpWks)rZHr2tvee3Cv9tf&;*lr7;~eIN3DPyLhoZqt?_$gmE_Z zjw0%u4H#5~l8XzY)YaQeyfO8{2@WjkcngF-;-xy54@yKrJ#IP6Ief@rBMxmA)eese;~Jrhvx6J?QtyVb-~ zM|9|3(GIvb@ZrIHIly8!{dcLYiz`s^o&mk2y>nvJ@imO=32S`ebC3u2XT}z+i2+mY zdEiG3w2jCc1^p6y5 zy|jS049Jlsd%Zo-Am}mXsv|AHz zSlkT)j3}?GV@(O7S~^Z=IR2IsTh*T}*YSjwq%2n;4>g6eK<^_V8b;7T@xefj}r<7LMV_#v|yO(dQ*g6G^2}W7GV1&=RE2vS1IL7pDEMX4sKkWy> z*T2sCaZAe^Cab!zNGK}qW50Q6VeXoEdqmK%Va-=J#s0zp$o6jo=Ci`F)*wO`gKr6V zWr^@|1C3j5P*cX*c=S;iTwk!9Q?j0=X;du1=q3kH^DRjnPhnW26r2q#Pv~C@6=H-H zS`rCO4B+xF>NNL~xMfWGwj^1K=A4ECn07kgmSvwAt>m0)c)FlA{p*m%jLY`OPHJ;w zg`LQln8``+HeQ9F?dwzS>9*^XcXC0~$v=l0lvVOQvO z=?b&~daa3(F5u8bUr=saWwL#Lp`00wqTa9+J}|hcJGu$Rlz$?WjE5W)ew#u1kV8ay zmN!qrn-M{r5nM;0+V1oLEtb3{>ni!Wg@H1ozG3OqOTfhkb{6;C7&#fw+sgohNOQ+V zZa8gbfGF1#H>Fg?9TsGvYyNF!#k2YMe- z!O94NL;VjdW@1#q=*GwZbyGF=lu1Kf5?jFfsIB@Iw+*A4)NHbiU&~f8ZdF>!o73qv z9~iF?!OAT5#HKh$4K()8*?fm;jEqTr!Un5;P;Fu@A*tFa$i0+-AtT5O(hAW7Z# zcTvPU;1OzjkqhS{w3Lr7iR&{0uh@MB^`Qy!kSr_ z6ARJ6amsM?mRvy!?uwWri^b-#jI) z+6=xTFo(iI*+`a?{#;-iE(aXau|Wd0wIamDV>D`GS_KoPh*zI1v$zsrmGAA_%-~Sy;anC{z^=AYkz^ zPBe%>VIIcDrRa5WILCd#YX>7_iZVF`WCstuCUzNNOd3nYf)p?YW?Fa(nGvbXtzr@= zX?#V4-5I<5GgHnD4XAgCgClmVte0lTC$k`A(8@8z$Tc{|<6t2PFLwnSeazR*Y-&cd zUzE7Cjz>ST)`Z#|K*SoY%@d3A$}os7;o7DeY51Vr%Hn~??h!$EFF(RWP7Eh&iMk_d zUNTF38H+bE*A){i^1e&0!3?JT^YVHQAdsJceGX8VK;p+NO*>HGlBYc&+0TiDD$Dm* zq{w4mKY`gSq=(R$3kwEXqoQwiCNa)*5Bk6*;Mo;lnwh~LmZ8E8$09XGVGB^M+3CGA z)?#d2(S^-+Y%QnVoxT)=*%P)?P9zsCWCad*h(*SFRHSUta2Awckz7VL(Mhhcp3mEo z*i@%fv**$sxRBlRm;L}1FOdA@1#IfMK|3w$v~~m9_9M-RG|tI3D)f990Y~}p3S=^( z;V5fGS}?b6$sI^&ZZ)zEdcR^g#Eh^=(%xX8WYO=K+kn1~h+qT+@`?1pU>0*>RksU} zuYg3OG8={Bd^?GW8q5OA5Kzm6=PY>&K@0EpP>G6G)cFrAa8Hi0VN0D498i$nG9sW% zx&>_3P&jG`MGgiPEE(uo;<_sff(u&ogVfU_g0x&yZsCuyQJrS?v~=4sbQxlr5uh*a 
zG>otn)piBVH}K37k8m{ZMuW&SEXB6gA?5`DF9ko4gi>q)L4pNZZk0>gN*EA%vepYN=$8@KRY)sl0s9+A3*ePOd z@oM1Fk;F7WW2ZJqy|Wn&_uPj?6)8hFs;I_`l^=c@HcX_c5pwTIqP_cy*tAE)i}ZhF z6BN>49I_W9rzz}-u2*sbG*FW7fZ%b@^vDAF@SfX7HEJ*uTM!>a&3SF`R4~RNl~MXGC-t*W}!f4Y!Z3 z@nhrhwpPmQDmcC&$Jm_=6%Bx;xygD40C$Co6b7fv@F)9r+s+I|f>D}sta->t4RavX z!@)RT_nWbq3h59>vo!Q^vJ(T%`u{h;r`(pK>5oLq2is6+`HHtK`RlROp{LuE*mBAW zDO0*@fn#IH!;JWPHw(qNM7iNCk(4j-2Oeu*zwn6ECj543WthlMs(yZ;SiFMZB8dNE zN(z@i0JYh(W~UZK$#-R~t1Cl6SHw~EzzhW4F$d{}6|>?qw#tc#V?uZ^-?vf+@%oLjHBu($;pxv=qc zGW>DJC$@K5xfv*`tV-Qr;fF-PB$7Gt!J063Wr=5~Y38pG-e*SDxgk~=aY91MOmA_* zZ@2k77EnR>yyh|Xfj+PL+q2-`T@kXic&)bK%Td8wvyJ8h>zQFz?~>RTebsLb60Pm$ zI}A`mjqWue4(i2?HFIxD6{3jRr@9IT(_b(nC_CBRWna8ynl}obALPWJW6> zyb=En269BrlR+Mrq1j{WQzCxJ?hsDhKrZNzc~|uM}TTeXfqR1?DL%mB8l+%NV<)c zx-bgc6d+|}oWbiI5l>q~bBJVdjS0B4x!!XRUxUK5lHQk=($gFX8PR<3C{|YD9)2qN z!%qkakMKCVeTyD&m5(k-JgtrFbYJMhN!HqR@H+c z1AwkAkC_>tYy~QuS^Gl|f{RfEDY`o*WR}4(Bg2_@u$Tl5@0`6hH3oR_umhc&4#I#a zV3bhgrWSWZm2FJ~+kBiHJd(aMa%vO=b$1)mesLS}F(Z)^6j7s0L=y?5U4KF_SJA>g#x&HzY6`nL@*3}0V?j+HwLFe%o&CEa23uF0lm z^j61TuKhd-S>Nx*%CyU^Z5ypT)lU71#6JT|92qBKL`{0sCnFDQkh1h&c-$n8$C9KY z={~dNdd6!>aCSr)ry=E*^#l#82R#(0CJ2iCyutkhBpj_pT_P1QRG^fOfmMu92@?Kt zaH4ZibvE`q8Lt%CO~!kk+i=F-0tg~U^dw|tFf}Kd#q$r@;H|-ZiMerRg!ph8g%L5X zpEC+9OI%kZi4_*_oG$+!`t3JgaaQBP@{m(z(~W`jWX7<&cww~Gkm9MGOfp3fLH6k| z=&)YV1hVdj?-5N!Wp_xUj17>1xjfaUw!}6H8t2deY^Wh+gp6aGjJPHOy^zkf=ee~+ zAK`I>KO_v}5n^6!qY&Xk*S2YLgM=s8ZJ}LD&w$rzeR3cHzyEtkL_T{k#ByFMvgg?sR52F zE$|+J!p?3gBgs&d!Www@*~n|uDRU9F7Fr(|pLc`nno5U|G0Q$Z{n}SQV*PT<;1)Li zG_sisW47%0Y;4|e`1;27Y`Xt!`%t(sI$N8bCcLDQK9n}l$(lhoNMMWScHBm|$dTdt zOAAJ>Kb;uPrMINFqyxI)H*D+$jyq?coG>@@Is&>PRf*7|Kqb$a4A7X}=^~)C03S;a zIgzL|oMXyn-m_kT)y@=HBr#JnV;K#U3tVv+(2j-`FlG?QBAsjv(u<^a#5-PZfavv& zpsoh+SlRItErwx75iISP_oYYD1JjI+6~s#-i}*+7i}$AV2Ky@oo*zg{C}BY@i8?`7 zL<$b90FYbALph2Sdn0;dd!LqhR99rnBNi;!hq=iwANar;sYDnOUEwl+8Yp1TfUjw+ znHyHVA$?dvsu+QbTx{!4av&c`C(=`BXbBK8H_0`|XY2x)OX)-Dgl(CT{+!vGW%%aa zAef~8?@8}SPt$u*lhaJ32b{MxvYwV5*0X@baBl8qiTu~d{EQ#=l91e%I?%qk#VTWv 
zuYI(jTfbE0dmuf}n1?V?!FdS+X148p!-Tviy&)ZP@N~43MmBt58ALI-_yg%s;T`Qx2+@JC6RLy}14ivzMrG(K2`$UEB1at& zBDF>`PY_r(7-t<{(eU*dzuOQJRt9Rfg8;5`0H@Lqq`OijeT92%`bWeF!F(uv072+l zhQE;hqcoGgXakTgDGP6mcpq~U+PL%2q#s+Gvv`NOkzfm(vAlZ^IJ~rEY)EGFhn6;| zvKIl3QQexGjG;7EcD{#fP(dfdNPHUe5(_r)j1457VzHe?CL8jG^ke4H9nsfS5>xs> zj7QSr$b%Z`Li(xn=hD9PStFTDu?m&-rCr$3C!OnmTYAyJ#R4p51r2m!BSLmKfHlVD zfGWooJ3d0hmaWnYVj_hzy$&yMw`NYyQeCM~wD|ibEbsD(k&EbYWAlbC$iu z=`{SqiLDyQ|6F=a`aWe!87jYq0(6FGhXK%q^b6_VOOo#6XrIRf%?vqRTl$}c^?g_R zg>+AP+ED1A(VoNYjM`AU(}%Sdt4&PjqY@DP4GV8%3luq7%M#-`vQ**#MZ6mH5X*QF zN5=lF>_jyOr8m1BK#Jhrkp3-EPr=tj)#3_Z>|FX#dgL##l-`p5Qrd?>9s{Hzn<&us zH>LN&d!z;Ux%983M*2p-W7V^XaAlq7Tn2?%iN+3wT(RR9fVs16ASNtc5CvncH&)83 z**fQgF0xOg52dH<{GpYN29A(t^MqcKmGqYM7j(#mNMw6wGlhdeDSWN*QI|i-DLu4VMkcihS=AG9_qmcD9l8y8QV040@7eWQ^fUt zs6d8_AA{2b9JgQ~X)NR(X?{Cc{G7kI1;{XA@hXEIXO`>r9{x)Dh4inaR{E!m2KK!j z`#C{#JrORXH>Cef8cAQ4UXk|A%R6yB<;sr+ZRqbvf5F)GTh{#M<-;|{e4X9|L1Z|Z zG5ASidM%834np2T0n^@&h^ZoWofwRqOz){TW4EOFk?Gd34gv)V3SViaVJQEl^tSZZ z07*0B9R~}kw2bw!YvEdYNBVPVDt*IFbE?Sq_DCoa-Qj)8Yu6XQCH-ruk^WeEK$J5^ zVUz1QmiDX#bNKub)08Y;z=o9-!(`T>^PxG`>JYm5DPeR*0U=D&q+ciKoLfLu(G60V z(RXTE=fHY3fHUbeX4_v{B3>ChK9W9?ej&ZTT&q((AM) zNm3;}l>VB9{t~A+O)S+5RO~0xn|9nsZw+czrl$6rUT>cisIZpS9)?yy(+#2LR>;o@ zoYvr4SH60IZH>v4Vsm+_9Kdq1y#dlQpGrR>_Fvltb-w}k1L<$1UvhNm^&?zLuSNu5hSlU`%o?DgU$AN*%hCVduL z6L3z2-+V{ZFpo5ccfd=Q&I*a@Kh3Op8WJ*OF!XQ z=T85>&pgE-UId&~F)Q~y>BrJre&?{k!F4{Da_Zoo`)+L8_qaq6>rQM+;5jz4V0VR5 z4|yCB8DAJ3Xky0@oRjrknXA^&0fY_usZ>a>aX@utl=9+I?{Dl)rnREPbfO+BQck_rh(nr!?N(<>Z zBHf&*YsHlxNk5m~8pPqD#%t*f&Nm&=$dw*GQM})hUK@xo-TBX@N7D1)&W%lQo$zXJ zNI!DZ=bAJ^g%%9$k}=CmqVzFSwYqk$AYxQj49I{0C&ZC$KP2dOUkQGgR_>Hwe+-y9 zMweHEMwo5=|Dlvg-^U-WY#L|Un;%L)mwvIW!XHaNfRKI*ryX>RDQ@Tu>8Atvr_J*N zX@MD=Sy$|WsP`l3m(m;C`ky{Q$gd2y9%I1Q#AC-0gt{=q4(zm+l%*5`kvHC&+zbL($^^g;K$PI($9e=d!N?o6AJ^ZSLQ=2zIup41_Du1 zkl%+B(UL*t*r?Jg$OG(SjEfFBSmXyQrX8-`cnurD*3!q)Ur9f=s&7!A3SRH=;869r z_yg%u`YE|&R3<_PuW(4iI%x6%OX+n>5=(>8mb}~9O+4rYUpz-wDjecSx^3;+YX?AE 
zQidfn7UpJ&auzl?bi+mxA?|`@tPQ#jJTx5d?g&Wik9VX~JM=NPe$ayV+Q9~IDb8F< zKPTkL;gUo`Qn3U(E`CpXEd2z`rM1qx9GK+trg`XNEB^{x9eu}V78dPE%K?)u!T9IW zZG`70X(8>~c}Z7}f<*GWFWqCcD^z90LbN<)`-DcG7B>5Z^gUz$HY9E%XimH9aBJxk z(%i9?3!=!80py$wn5?n$QXV(6N?~KY`Ad4{cj$lAz?O7-?X%_=#^$IDW9?{&IkOs% z_=m7i6BahhjwU11{4x-da6L55Bj+ygJ=x7E0BDP zQkQt5-Xj#?baO}q(M!8Qh4;)K^;l#(`q7r)UV$6RoK%T9UgQI z+?Lzj;;)^99^`hPcCgsC9I-Btv&(;Kw2)Bkl}|66;< zFh=!FDtv-x-vpI;44d0)BO*m zJ@BX2WZWx)=KOSJqDo=F3KbaJxYzYfTUc9GV6X*bBG@g$l^L(hV$e|2C{jG>;>=dt zcTw|X`xDRBV7)~K8*Q^2`U0=~ivr<^M*G4VMr#Xf()FhfLC3PM;D3#9`jJ*Uo2Eyh zDtAhv+8Lnk(&rt&i~RjJytw%dt`x!61Vsx(BA529AMOIuZzuf(s~r>CEDSv#VQcnr zu9dMs6stG2gXRiy#>je-IIfUu`2+*>tBCMPvWSbTU-!ySmH(&Tc4PGS2>+8kzfa-u z435^02y_N#C3rp=iC2c?E5iNpP64IM0>F-46_v4|p=M8hmnTT!SCW7BD_$`8qn4EM zH(b26MSTn2e~xHoB!q?Cuj4U%Al=(KLM_2wV{5n`5Y4Z5Qq*^5RsdX*kk7eIFYfxy zy2|-se}+*xy9Te=k?YzDWS^6b@iP7njU7L_61Ynwmqpn^AtvO-iZ0 zbq9NXiB;;F88g^-6&0Wj^FzXnWj1YpybU4!-7mp^TKUIiV}n~4cEHr0HRwb(FhtQt z;Kq?q#G)>Mr?0FM&{Z(Q`$VR&u^rdY&$_Vm^V8#rzt!hI`QkZ0w?S3C29{FXDP@5D z!8DE17}JGyyCny(*FV`dbKzGcSmyv^4OQm}#k__x!ro0Z*8jcb_qV+`AUK~KmDyg1 zR^c)A?M@1{1&6tj+DcHU^iU~oJs3=r$t{N3*@$sTpuV0%a@6LZ{aF9|H;27@*A4hk|(Xk}D@{rp%5BD!F0vYrNGzoKMTX zKQ`)FPG6d=A2GVSvhA-}8-XsO=>;=6M#fa>U2|mTwK?sP4SWiZKij!m090_f)xn2 z#nxZo$15Wujc%#{|LTJHTP?w_8sGeScMc3cC_FjQ{DgEi1{F-UaZzhn9TeQuuIMlk1A2}``76^UwzdFu2(>Pu4^ zSfhf!F&5sHBp6M-iAp&;z}0mtr_Z)R*cF} z%qU5Xt+lngVQ{GmV4pRZbt#}Xe%Ev5bh>eMiY4)6^XF?q*Axw3MBXoQO4 zAJIs$$5f`F#*{418f92mGi_@={pty`L|h6E5yn3Pzj%@eV}EUX?@wR6owp4|xDh`I z)jg$$w6fW!D@Gnq==NQC9V_yvDPO;2QCh|VL^Qe?w4r9Ugz?O4d*C3@*2YYK(#NH+ za|u3OgrUN&#jS0D-Roa$JzG3Q`{X-sV5mf=LyKIK3QziH(f1l5)BhU*F4y+{E#T(` zKX2`@s_SPS^UiZi7HW8lkEKswb(T*U2IucQSc3Lfmw&cPicvxS^v#p?FCJ`x8;b-D zp3PmsdcCcZ~CzJ!FZG|JeHi)Soe zJfWAe7dFtg_i2ory@#!+w#J8+c+z~Z1#%nSRU+`*;DnJ)GwE5_tUqRS%nw6bt{8GS zfnGd*viu9=H?m+d%IXdFXK|DMk8NCI)R4maZ>$wz_vV?Jq0DSLcE#)5xOg$=$0j?J;va_dt$%gLGo`a^nHltaS{qYcb{{ zEA#LEq@{mWdRDr_Sc#~w=F%l|Cw9yKSbAA{fnaNl(g>|Qk=~VFlO7G!xRmZnFH84q zaN!6aKBCj?E$K}7MkIbVz@RY_jPi 
z8u5nupG$|*^V0K7MGSFdz+smA)VaBByEiJ-LMN+sPhIfuN_5e zZMIRxnvbw05wUCt>*ex`lwr+uo;?FESFV1&__LOR^o6pa`Z+4`HORQVI;C~i>kF(btzI35j3S%r4!TLpDcf+{EUUj4ZOc&OIi^*u1o8 zvNF>h)P}ixkA`5a0RkD(jS-+{`mPltMt0XBLkqtleHQsAS&V^H&!rEg?@Rwm`td;j zkEAb4e<-~uJ)jdc8p%=+`ib;C>HoW}|A*4&q;E-ImY%g)P_fTa==B5Xf0KSPXhw|; zY(@u$h~8oqPo)d%(dq$Vjm>>b#v#g#Q~JVEd-q+d;aq!V?E!xowmRj$ZMS~#>3 zgI-6kjsnTjgq-u8)rb1%-Z~j*Lv-V3Y_w?U)6V=Ym?K8mhS_&!O_^kA&(m#`4!X8F5w*X@wNJrAU(i_ptolD2k zSEN6c{z!VkoJJJB6BF~UluI8-AMQMv)|Vs0F{*zQ#n^^!^ditejDtpdU}CRvS8F?8 zH}mV4OsU%^Yn)&P(>`i}H_(#vT69=X>PB>Y)v&*b06;(b>76Y1O1 zZ477t5jjeIU&;mp9YX*AiS!-m8D>95ZDY-oucdp^iS*uJCS+z1e*~ez;>WYpNcX}I zw8>}#j^R^NADC=~^Q8qw*Pkd5UjmgbfSbqInk(*+8H_%R9+C<9tn|;NKbAfNfxBcp z+68HSV%q)E^*=ZL|CaPDduC(-z>2Bo`_h{}jV9$@$?tqedIjF(T}jd_(p_e=<tbrBDHWVg(<8%XgteEJDQyfn3f}XuiK4|e>{Dx)QhiIRzAr7= z4o>pjDgsZW7p3n?-@^JgEN?@P)CNYzvkd=!l%8iIJu7`j`XlLOGhOOW2FL--lMVY= zYEq2xqVz}7A4@OcGLOmJmr(bThTM{F`4pM>LAmes38HfbNZL1DOGDNw#;)LK#cXNq;E)f%H6sm}EXM!wlro)6%oj zf%Hj||5$of`XlKdOP?d*6A`4ievih^+b+Vi@_#ISM|u(T@j^Nw_H97LZ%NNdPaCPJ zXFx{;=wozU@PBIbF={|P7DI(89b4~4M2#!j|5I3XHUO_OOS7VIIp+->0t=*Mqpi;b zE&Qf*mqSRZe(CE{I+6CJ+tO3gUg-Z=dPe$v=?|r63EZwYBYk8wlsV&Cr+5WTr01nS zlKw<`&H(t5KxJ&bR8LEHxAi|Wz;E6Eb^Iep(hKHQb<;FP)?PGbfL<`#5~y&8hpM-S zDXy80F|(n>g`H!xLiw)-7_`ej>Hi-}&vT#B-1wpOiaag7BK@WGFhux*^d0Ft((|T& zQMbqq7g>OjUSB+wUY7o`^vBY3)LWC0YZ3cIdYXkyd*tJR1E|CcR`ZcuC1Nk`P%s*>U*h((}^ir3coCwZztiC9FxVrcYoG z7e=^gQ2I5*@hQG(HN;OU004jhNklV$ZKPV^t$;APEI4&YlL2blpJka%v+d>PX1q%?pn{&$T;1?X?c}Ve(_hN=OIi* z*M3bIY|jR{?0~Q$4u8z7y*a4-lmI3JkG_H>-pyjFt=K=}=8XZ!krr251V0H!yP_4L zv{fvb=6S`|SZ`;*NeT%9en6SFh1eYfrY=c*N)CwcOvsnmJoA2)Voju9;Ljqk$OP>4 zCF%D7F4ipgzHyr~6mC2?%PZs`8SMfF&B}J`*dqJdVvB3x6?W8G$!;5yL@ceaSsI~4 ze*Kg;A6Y$TpW|JZIOF|+_vm2QG4t39puaiRuV#5`J5H!a1ekMFals!IXhmh)tu#ot zj_~6^dR}@4TXV#_UZDetNV1OsneAMF4fZv+gmG@|5EThQ#SeSUq%AMDq82|R%!$=o zqOGOBeBR#xIYGw0F)i;OjFy5D&83|kKaAi~?dt{3qL= zJcIRLvzNw%1QmsiAj>H9CoX=Ppr^3WT|v0qY zF;IiW*)!VJhSo;m%*WePkTMH^<~Z=e-mqcD(K))fzpejC{x3<-Gjhdg6@`tvTx_FA 
zv3Pf}^3icygHDy0u&EKLI{fDu=5NpY9lM#GOn%N}n8kJgkwE}0*dhBs(2@Jggja1( z1WVFdv)jYh*G!kHX)gMZnS*UCUJ5SeW|c1(4pCrHV;A)bI9(_gG{^%>Pa}M3@unxB z>gN%J1%@~!@DMwBi7^S*eKp`)6Q|h zbm&fjg2A2WId=_q8rwMDn&7I#OEPOsZa`eoLRiMPYo)^dbM`E zUbl_L;JXLpn=YEl&XZiaNzV*(iG3DY7-Y=4f5_&J8d61#|31h=%R(*}3Pt}8%<&A$anO>dxuo;x6fuml-COr*W>Xj4Femn)=*sZnBU3vg@VvCC1BD$gxv}PHg$`eGPSzpS z^i%0Wqv1^mOip;&>o*T`S>hr+5~#pTLV0#~0ta#vRIzjL!y|x{)&?T zWv`DcEL~cYa)&>*q^3)Qz!t3B6mhILlc?CKLM^r?<3&mg<;WU|#O7TL27+$LeS=uz zRh0%^K9SxX5MVv}TC)DK`4AK7w$$KyF1KTd%v5F0E2Jdg5{nW@{@mIEt}hOH*vu$w z34b{vj}t1?qd>JbFg+?x3j~Vbz8VHt@voo5B!3q1(|X7?9|-4fMbz z1Ib+CfhM*GE5fBbBA{RyE^Lf6!Eag@rw~PK7_wb}!KwzWs(T zT5>FPiNG@3Mcs3*VJb8>$8?1Qh!SidW10S#vudoOaOpg6THpf|ohV2ndWmX=@ zIm+ia3jGoKRgh&;r%J^-%bG_=c64_eYIFT54C`LFiN@c~C?8x&A9^EC#_CNh*jYhG zh?r;$wI)WhbW2}YFGS?YY4w(7LtCq_4Kei05T-HL%nVF!;SjFjtX010_o%Z^tS0H| z)v#&u!8o6Qz-F{x7nZrMunHAQwwni?5h>lmWlsqL9!ql$v>6kYYtYF#2!ERO(cSf$JSE1LUjekvm*u0F#5-EK1N8PVoS{k9Rz^c`o%L^G4@C$O7P&u zw(j76cJNwqoimFyR#w%C{50DxqQxipHx;DgyRg72%jGM}!sSw&aNFF*G3UI?j1AZF z)}4ms_~4S?6~>}ztslJJnf{79%q?XPB2IXM!a_RN@$2aRl4P;A2IdT>S>d6M2#b5q zKQ#(bRJj7WUt?5``$s*M!S$U3sDZl^ezF8EPft1Sj~cpQt8^HyEOy?&+=#+$EN|qMUGiDc zZ!ojAmxtK;HK_O$^2ME=ny}*e1IDfZsP-9}ILrWo zIa2nSd{f)Q;j4!{=m_GEn8&q7<8muS)s|xQbTA;12y+6e@TQQ%enG#(&@^co)7S5_oR9g^jpbV1BLqBhh;tlS z1M8(3fF0$x5TR^J*AaanIpwZ7MbHIlNvHe`&wt4?uTbh7OB7z|asYvjaC#-itThNn z*r22xtDX2yR6<75p_D;M+_Tb2^i;@l-Rn;(JVzzsf(){Oz?%`WUTtIeS)Wz6Pux-T zVaJs<$qx-^Yrs@XC9%baMnN_@IHt~pYO&P|=_6?+Julrd{>~Ue$J#n7hXGuqmboDy zHNPEOoH*U-0CL<-aMqT5!lz!u?;OG$3zLON6-yP#mYjyiWGRCT@{qZ z<<25fYhO1(MH7H}haDT;f*Ndq8iMsLmTF0)ydix_3VgW}!*rr+iGH3+CqQHS;7|9Y zyF2>;DyV0roxmfDAsSYy#Z>kpxCME7ONnJp1b9JlyTNaCp+^RQIyN_)k#!wHk96+; z4iqBNk%rR6A>Wnrwt;z$2Imn3u$5sk#uhgOTcb8sWW@B>&mS?-n(ZFsrLZK45_xq~ zCv(BpPXXM9Le~iCrqa))#U{NvbV8O7aFL?qTBA_`uq?L$>=~rhwMB`En_I(+8cQEY z?-5lD2A^6B-G&oh@7s|6%G9J=ym)5ek4r<&!gZzq z^l;!n+zKVa)3HrJnPT~JAe>7ECib*=YLQirB`--IqKPGH?Tz2IfQT9TcVOvP#-9m) z*7ZVmHGtBtXkbPQ=h%|PUGYPC9j?JU<<_|{!6#YoM^7GHX&`t 
zeggwK#g#8Oh`dJWYq&7MZt7*>Q3g@p5(ejFWb7ETSoseqg))pbd6MHw`0Go_| z?2;Cm4Y5m9<4fbzch)x*9UgET_PJ#DLt#Fvr%r(PG=O?IqYi zqa2aDw$@mjVWCEbly;90{EuVff2=4Y1sz3joAbdVWZbPZxVqqfdeI^u+jFh(ho^)G!4sv;i|9VaBF}%x}ZMBk}cXd+@DIm0^#V{53 znc&IRMiuT@Kqawe_`uqR=4~uxn^JA*8PL$eb4!gPN-J2MbAZ$x8GwWFjRmb6AQRcv zIFUXXlrf?Jk#2rrEW8!;gA!%$kvA)x<9@#w*A`-pU>@d=wNYl;nRdjbC2zQD2Z2CQ5m|DK0=3jpV&r)0nL>Oe+@j_>!KTc z8nkp}SZ8U_%!r(#@HdbAId+L7zA2!r(S>X$gcUw;L>#lWoPSB?89PJ+H{M!ySK2C8 z#?4#uAljB+VWophNYG2^4v54W>vJ`rki`8Hm+K?i;nq;lN`o*XR=nC~f@b)Q!sguu zl0P9nYz91Z8SmK9??@JEi^F^5t?k9HjPw~)yR?e8*c}_2G}C+jhDdc{Iqn=;lsJQp zk!0K0d22fpyP=NR5|f7^IHUlz1A@ze3CER2ZT~A2E~vFU^4*Ir6o%h(D)OOB!4Lq|ujtv4>h6f4|fk*)|`*Hmaq z^826&^cXBr?z#l9s~Jf*V~wJK)~Qg>0aS=?#@UrYO$gYhK>Ibw!j{q}(mhr=n#Cu4 zdr<#+tpJ&k*ejcA6m*mg2~G<-^fAxX6aA<^za+EQ_3skCHrPGaUNeedQH|S>xlPQQ z*W9do`GmV3Ny2E)t@XCC3}|BpXg7=-z*I?2H3EW&B67{08ox5tP?mH++(jITz}tXL zXE?xBXkSH87%2DJHeGy+c!k{gINJ6e zt>ODu?61HXLqM$|yu(jQwvLFJ8+L3khvI1;Gs?4rP>Mli%eaYUKR%vqS5QS5u@A@^ zDQg7LjUD1r-DE&Z;)4MCR{&&FjAk-2XJ7<9BFq)t--lL*Z^#KIcC7P>bHJV3)c}!? zc#0W1TXPu1Y-a==W;{1t2!jzkkTLi|HrT%vG?fy|l@akQ%z=jY7z!`yBNHRqhEfFu zaB1CfH@-r!V$Uc5O{8NC_?m-Bx-hP_2Z=O!NxnNZBOiT%JrYf1K#w>QHyOWES{NsG zk6r-Ts{u?2Rz@Jc3n)z+ewbLMjiU(3BYiBrP6`}^+ACwB?13;2Jy^?+PB3>P9B*)? 
za{guoe`f~@(HNgIXC-GOAyGys@x$GxCN#QWcuNlSA61qWes~88H3l3E7`TQD6P4x4 zrcv%Ff>mJM7YOVMjUS`QS5`;wp*Tsu?%{epYOUZUZH(>Kqu>M!Vq#x6CJz*Edtv@~ z#|Ez1C@r9GbRV9R1WXNo$_6j*W$u8>3u1`iw}(IPu5So@uh`i~ScL{uI=uhLlv;Nl%6{^9qGPCC`FQ4y}|J3LM z5$=SZZ-`SOW9pR=qqcUB6$afe@QOPE2oW)!OPSRg z3wWEy_|WBG7E;DnCg+;W{X?muD(hYG8{+$}ku&_JK!`?Ip*`-i;GstL$L0VsC|Y+6 zNiC=vM#vQEG8-7sP_zqB;>J!jOMh@+JgMFW4v$tLpE(goKsYUFNksWW7RrjyWR9UZ zGCXf%QCc)Ybr(m;?i^!w2zem5sT-1`YWqx%9E%xPQe(JSj(~4jhw{*YRZwRyxqe|l zSghM}rxZ8X{X=8Wr_YEVMr~%gvyR?cgF{&9667hkxwW5VG`PNj&$ec@#_Xgcn^dw7 zm!dO3K}0H;B3$)(=&jL)7MS|Zhl%A8VEl%T)xyR!t#1nd#Y?U&*e)&njCQVud2SPT zqr&5h<(c9%GK*k0gu)xoDFUIJk?E$`?XgW>3jWZ=O^s~1b-NWr2{Wr?#YPaCHUxwB zIIV4A8B{om3jJ})QkOvn6{JLeHvM{c*w)Oa*0a4cN#UKUXW%Px$#9(4+r`yKM*FQE%M)aAp5C>R}a3j^ggv1tD63Vw$11;&& zx)@N%g1eCZF(cdzQd{(I#^1DqW$zxqEmljIvuE|)77TAa7>P7u3{*`KC$~6!g8LC` z(_?}KUGf%M77BD}BTT3LsV_S=uqb1+d}(WRn3177V-sF?{fs(FG=0uYp9*7CEZv6Z zX38AQk1bi*fuId3>n@-dUE>YAWD>AjPn`5H>oQW4}N6{_-*5ZN_#(xIdxH0q6yLn-GU5yQy+3u~V z2Q0P=V6qnu6ypGwQi7+~wj&nXDj0D4rS)=6`Bi3!&M^SvHMyWRdf);&*3`<+Vu3ID zZwCubC~4#y$%-s&l3C!oavPlS!V@E_tNp0X=X7Itxn%#!_ zKgLL{&2?lLpJ>DjdTbX1M6G6GX?|h6=cuG?7)CJE|Hal31(7CY);&G!gf9nTh|DW9 z)_KcAhiwip_1XhZrMuE|(sOncL%}kXX317IjQsi&HG#tRO3TuV>^7*t+`D~skU)92v^2pcfM0WAA$ zrIc6h{$!(@wU)9*r3#nup}ZY`B8(?|n0LXEt6g+sOiI&4%YpL&?~PHInlR zg$Cc!5@0?iUrWEf!1buVe|j9`jUBX@(S?hYE+)$a=a3sJz?DiT&HMnfOCK}O6e!?qF`?E)v#zD>DL3A-(m&iqFu(rO**c**` z)0U^|QJLpJNYObqvoNp4&Nth8Fe2EMneM2aM9DOWK+}N%h_7EmTM54vk#V|s#UjhJ z>iSj3maL>DURc{pi5iVij9v=a7{g^^&UD4|w>W^}rf&GGA94gYHksrL=+pb;wE?hn z$#-&>GnBDlodTP`z&7`?*bQ~fTjncjnvr9OL&6i!Zb||I!;MPs$21NcSx6ysbv+_j zPR<==-WB|v;Nb4##%_E1nVD3tFfmiyZH@zd#89E#5Q?)AMr{2H{E;yCB}Ht{`?^wY z%_Ah7D8rLaxgf3a@%Di>5y7o&E!X(7ph(Pa%79~Io$+8LT2$kGY7RsxEZn}{CoAfO zb8EzEED0YW|1fEt@JH;teIsmT7J&wyUhaJOpnBz`EVXnd$woufsAd!fdMbP-oictQ zIv@8fxF1nZsRx=@(sDAU#N(X}ZmKeLeCPwhjpta11&&JnEPI7gY~21iBs6CDc6@2t zu$P#IAVFmQ-68-<8SF8ta%EWJ4)~;V02{o+IVLt(nQ14S^{w5I0~g?G(zQdbtSpY5 zQf}$tM}_r0=0*Y@Gqz_aj34SLWKq|Q&n{Aj399YP& 
z0UIi9QCC0%`F8B(A_%l9qRUHK+h)u~D)DWb!9bD{*AaBX8F+fcyOFzIA+JvOH42v; z=zSPLE$o&QxaQk)Y+ZzF2w&-`QuurcmOF=6w8z#Qu3rnCt-r zkuy?RA@=c2`d3q+UlIMK3VvnBgyqI4s|P-`Ao1{6IdsBaSy_jF;TEva~HUp_5^F&6&B8D*)6fE;B9f zClq9c{Dl&`wKfyi616O-O|&}?1WB*`P`b@5<-+QCQ9dnkPdyv5VXscibevcxq;N(N z)9Cr2F}1qUd~l{~kWVshwQu-Q_|ugE<2@ozgqb7M_DDK_g4bGOs@KGa4QRtdFt|&=i-yzXZySqdLX+arO|wKAzI)0xTo450 z#(XOJM`K;D+=$&6gBHVR6&|taJ=0P7EsPJpRvo2 za2-)T4?=h}Ob!k30acFRRmnz3kJ-%azyU;bxR2700Hs#gt+J0u?si~7J&XvdWEEQA z#EA_I?1JF24HKNR@)hf}0$VSPO5QnuHHM<$8#6v#1`v$V7MyrV=h{rVV-qH;ov#pPScC|r^|VY__ES=ni64s- z5az^ikizV_Q0<_1_V!`=4RPhbVhW?X$9;PcPDTXZa^_Qb zs)E+t(n`TAFJiRJFa4N7wl#}>g}7Q%=H8tL8(?YaRcw*;k7y)o`aJq;7;Q!KP$+<; zJyb8cA4@c}ahY96?@8}7jbO@h6)a=uO$cSjr(opgKG5r9bU+r(O`tjSsNfCuH40Y+ zlAA(nud!jh^e4)%%Ofoz(`Q=vkSTdx)VHKl1a*ZlJ1oNEHP_%DJ z#DpMqwe5f^ti{}piEI?qGvZ`H4Z|uq{ zTP2{(-k^e(A@K`oMa#gxK?S9a>ABiD3LF-{b-j!Ztc}p{fd4HVCl61Jc_!rWT zrDvpPiDNg61<0*4WMN=Qckye|Po+${Pu-=0zfsya@)}LMkw%;uo4$cj@=(g8Lu~6M z`@GYF#!{}aMQvlkof$JVPHC;|g1C|8=_^AN)AO%5RlpgpI|?wD(mT>S@JGB**0APt ze8Y#ne?x??(FAnH{wZzelsoIVX-PhmK9HU!>x*nEm@8AbSRYfT=zUs3^JHQt{qFq$ zk3vR|2A(A236G6CzcT9?)Vz_k;daJrZBBPi*HVeCFInwG(Ax3*jdUXYSbCmVY0l)J zwCrJb1;(@qPu@t6rPrjlrI*ZyMHgMm{;9ZG7lWq7`=#_VD-Ld8DvaR+XP`N!gHaYe z8fZoAI)ix-#rxXW@MqGaK`%~?(VrRh@;3YFP})cz5=v%HBMbL^i!EL;#beJNeUH^D zY|2D$0WPE;NykzneObE2V=oX_i33_7p3Z??Nv}zNA>Ec9z!s^AdRBHuPu9;=ubta_ z()XoT89Z2G_cOEYV^r&7+=?h+uZHkwE%?Db_CQfT__MuufYO@85Jkl1>s z)Gmbs3;DHH%@3S1+m`s!K&EkR}Dn64jTDSafpD?O5);VGhk5Lo2e z3T@p6IG29JuwfMiQU=QSk98#SiE_QW9^ubA@<6KYS-3VU04Uq z+)==QUDG>XaV}j;BVdNp%rU7uG$%RzpGkiqJ&>L?^t>!9Zy|eeE1T z3)kwvs4xk`B2HsGm2+eC)R3`Kr_9B6{x+44f!l&YBRdy6EZpX%Hq>M; z-jn{lG?ngVFohx^JD1^=^ei~gumhSCWZm$fRPi7NdduRe?Vi+4Q`zfEH(XfIQSa9 zB798>xv~cAiLGcu;5xOyrQMeQ2hyJt>3v0dV9fqA)Hv;`_oWZLaV7Qb{(x(q0WsYt zrdb%N=pE@TbnN=h?@NCv?MrtFx!#vfqzBSnjzLIl*Y~9lw&gFQdM!}mDV-9*`4e^@vPuY$fjtn)`40c@~~N7i67N9w)bKG}=#gv&c5m2hx9#Zb`SKmwBQ! 
zL3lx0m2#RB&!bn;yVAdtTIs9OQwCeDkjo=l%2JUQHE%48=y70|;6KwoVm{$3rdY)J zV2+2&|3m5f(t&hWdI@d0Af_J~QRRK!m1W%RYEsDl71@vz)v)=E z1U3m~^MdF4MEarhQ#U}#xW6I&FOsA$0)!nxhK(D%Fa3@5BVRoiVa4AS7CRqUXGQG( zyX2^y2&332(vPK$EvYSP52R-(|A>w%(4Gnf3Nq!=n4m-XUrDc%qnt=D@bDWOQJO5_ z_odfd4O`-vOgfid#c%9c8)^Zq`TNq(w&nkp^ygAZr2D?~mUK^giQ&ZwsrZ5P)16s- z#(h=<cbmwvR<{=6;yJE@jlmhMYa>0KUSX-@AK($D?kj)_!~r>-pm%@E8nQQrj& ztsab;**vHWrfE)aDbVW?0>}rVTjRKMGefnla-e#fZT!d8=u-NHw2;0>&Xr!&9YEsL ze$;0*UW@TH>3^2~3U!H^>IO4-tcmKu+(lzwuecpKuL2GZjf?rZ>j(*e?cbhX>XGH{gv2&Q+U=J*% zpWAGFFU!|h;&%r#6_W^mBAr+%Hzzx9%@VyQy*m(LB*K@{_oYM1no|Fp$=(Q3P4qqQnJxu9z9R`v z`u|gDA^j!CgD!0^=G^HM>HTf_UrBFDSJE4}?uZI18wPkPeXxD;h4d5Yl;ETQIh~Qa zE@;R5U>gF;sGc0rKPN(mC7y6&8|mCg8=d{xXL#8uq~{}>gRxIk7aYHC=gmo3CisxT z4qh3d2pL$;WT*U-3jbU>mHs1sx5QnHECNyLZgwPF*?hN2ie_??Q8H$=^aZFTZ2UXW;;qbXVTB5H}E^5D78Il^cwU& zHAmXL&c!8P+H!Gq)195kIkj1P5rIZ?dE{5Yxb8?2QQ64AiKeV#5Xi>3m)^}wBTqCa zeeC`@!Vg0HZX#1yf|c~X@f>0me6e?n&@Yb5yy|)5#cAbVlio%0HD)9@Hr4hl6!pKj zHLhxFut9jmv9(1EQG^8~fGcX3BO9V1tBGC?=Fw-E#|S!2V^Z9$|5wu6#)oYCiD!M2{4?s6ky@p=^Mby>u~j0* zOzds_;yG)RXgg>2<_oZgV^*%GKS@IpFubG@ZqFJLVrxvieEh5EzpNVH@;@7EPacZ! 
zG33jgyv^2<^WFNtklwR#Zn7)eo$?>POvKv}#8ia8CEU0T8F>c*HAe0%Y4=Rw{}Cus zBOOZz{8^7ew8H48=c$YrR9QOK*x0Ze_dsOSuJ@rTt^K$A;uYL|*-hI1KR*59f!*^f zcVAracxJ;tbgV1BI*VyT+uNb;lhHVL2ZQal?U-DyZeBnfFhNy zfk2}qHr#MgnToELAcPlSxY5zH=CHKQO(F;_T;CwV_Q@*T+Oo$_1`b?Lmj8MuPw7+d zJfe3X2zNRBkf4GDt~~cVG{V(2WwZwbnq!=HO58KT_g)N>+wiGNnj$CGL=Zrf-{wa7 zhX|kW`4s!J{*Cp2W0k;=&XeRn>;nigXE^TMyh&w*?9vw?NON_6qm7tF9L<1U+3hZl z2IR;*V^xC{=Pe++A&5_vZ}xBIWi+|NmW3PA7A7j@VB{D}N zmrev*gFKvIKnj5F8g!?$;}v=_OfV>dFf&0T=KWr-)J%QyWCQfKlEUA?#j@)MG0>-c zf~^*GpwiAX%1D$0Z|j<2M>M-^=t7Jwv?iLr;LoOJg|0ti={|=L6;J^}5K92snh4?7 z*8l&6^8fWN?%1%E>GZbC4|?Vmz;aiLjr{Gv4&aj!5Lc|#8d`hfHar7y9g}ioMk23Z zJni-Io8M48|9O6*aQcaKCf$WCa4OXhm-nqoe&9e(IN%x>x&ne|U;{@J!jdNM(kp0- z{%K!h1KC#eAsoWttbYSR{;eecpJdr0-WP6QirtU8RoWK|YT8{9uClWRGNUgPFe%S~ z^U6VTl-Y>0vVRh0gQ4>}-eM8E`W=5vhXQ8BtF-0B?McDI33u1NhJUlQE%XW<#Z;{5!EGh{g6e z^RVF~!%SNiGLo$*$ax-iEyl4FCk?!6npin%Re^jxLCIj6IvzG_7BR6yakg zMs4$Vf`YNJIsd(${NS$~N-bRjM}(y#Sa7x5B+b4hZ3pKUD z4=HNj!y>e_h*o3&cgH8Ui`)Jewd6#*FAkE9lVtqwf>E;jwt-}x@z=+U@Cee~iWxIY|Dx8O z>fD;Y5;V7#K9m~i9^qrJ88#Z8_QB~!V0O99;apmA{z;#ZLT0Zf^V>loyHWl{f6bz% z-uB;g_not?HIqes^-anXz`_{X*#;nCY?RQMbWeKdeJ~lB!agT>EbSFSTQ|1Nde0v} z%8b8pjICO5j_)B&ET4}56t)Boc(PRtneSftFuDy`WW(2u zjf|Tvy8>(I$|$+bziDh7R%=sIYpmFnA7&DYxq;%Gp8t^Tuz>%T0M?TyXs)a9KoIe1 zo-9IH2gyJ{v2C7Y|MI@^Yq#`&DE}QmKyBRd5mQWJnxp#pQqKWgF$5wKio&MsjiKTe zg9>tvznzi1r3hf(Sko8Sv+@SPb^@4i1oIxYF0&~-@pWh0Y~-Ado&)M|4PjyA?`PEE z7TXt38A-KoJn~=!v^3m4-g&BDLn`!$9@2g3n5ezQHmt}(hf*k{ed!J%XKr2VB`C&- zSt!$OCTMQ&Jhq8AIhN>@MLoQU2Q2`*BZztpuwel=GG*Z>(xYuDg#3^D&Hf`hT4}|_ zJLEs7_kC)kEP_I|WL&`M&RJ4X1de$J4a4XUY#>A~9ZI)s%ER@=RUbUdZT#(;X`oTf zFFXUTP{|c;C|Fz@98k&0Af5+)y8edi#>QZ|4nS8V&T}J9 z%;*}p!XKn%JNZ-t96R@j_ixw;QPO<0QwuXp$XyQ5%`8?9pyXUSyXoRX>{kH!E8E4% z{`BNOWTiELH#vJaG_k=^mNwt=T8G1_UBWa^b+8MXD)#6e7QgEJP(c&H9uoj+qe@iN zt299huX))$ijVaW2ZuT1@n?uh%SSGz~B zOZT}Zk@_C;*DH43``hx*Fyj$@wdNixfH0XvI|CShbY!NpB^cV54lo2ckxa*7UxDw> zu!kvJdW>%v+Ys306Dq(t#hlMbpJ%XJ3kz%tA?9Q0htl^3?R06)pOL;SeU-tv1#^s7 
zgf2_zE$MrMUdW8i^A$RQk3kd8DMMA#htiLwAGrKG2T<4ul-An0avO|QlM`sb15D}n`o3@m;Sl*Rp}VyB~da{R98tS(qBoZ+wvDmHZ%M!fQ*n! z*!eR*Qbbl6*x2;SbLk2!XV&K!ZhZd)s<}4CXPUW_!|EN`Ebd}w0F*@dob-p%x1=vf z4=6^QFlqaU!p<%7sGbRVO8UC=kEAaX3{RoYd>}oPM$#?mZR+3~U%cm}KbHQ9^nx9T zlfk`7PyW6%lFpzxHJt;IF=1NV1LnQfaq%~#e=hx=bkF7(RCYE%BRwUJq#sK^B+=D!tUhL2gK)AH=TDvj$Qo1L7UHVh$>(V_FLCa%T zsL6}cOVT?rl&6qxNxv)oW9b{xZNS3~mNciSDk0%7O5c+n`<=fb{fYE>49J*YjcvSM z>Iiy(cJ|~SosXkF_%pTc?~&m$+9tBitcAGyFd_`y-H00YC#i5)F){zXg2lHdJ(iwjqE%;qzASxH z`dy?oqcQv(Am>PWMmmyyB>hl&?3?E+(w|D-;Asn!*#tCZCOs>?D7_ggUk=tFRw8Hj zUJ(%PapZEfRUEyR#z?xuE?@KW7V96ed+D8cnJr~u=Re2LxH0HN!%kf63?HV_i_#xR zUt}0k>Keao098zTdRlrWYC1^=UXs2ceMZ^?s2m&kk?s~Y=-7|*nMp58-!MQtTS_)Y2acH-ZJO}DXg$@e(Ca^x4k z0PJ+%`sZSo*9P*htt2yaU}G?o6>xuJ2XrlXgtdXmJp*(LLpw6uxFmF{Y%**!Sb`ka zv9Sbq4FoqOh3{3*w&nk-^eyR&4ACrrQ=*x;HiY~zF!YfBx3K;blX*^vnKtZg=>-_` zneV)>OD{>S^bwTqiaQkM^q!ZVCdll40_51oX|IUjCziVw)V3;@-=K|q#01*MR6@e~ zudNt%{rsh!__=~59aXWcKZNB*{ZArH_U9?W9og9+(Ndflb00;dWC{NXx8XxSS`%D1@TUxuN5(AK_f`zBXInaO9#@+(r2ZoF+dFlsOIf9+%lC*uV?zD za>uLEt9F>)lz@6{$HH6zjqQcxa%o?BRr(^*ulhuWhC#@IlVVr!Cta;fBTx#mUSJ9X{P-ujO_uaJ$8%o8SPt2x5=9 z^99JkirJe7HlSwAK5qvvAA5L41?!5(%Z-XWA~{)*2sZ&BmKKqQi7VM9K^|EHBVX(! 
zBIT5vq76I!@V3q99nARAK!i~-S=&rlSr}1mLwFhF$~$H;|Nq?m=d&c)b>#~#SD~&* zS-PMVL6B%rfLgSA_H4}NU!LjLJtSX~O$87jp-Z6(rBYoH(#3o@_qTnyhuCC|vh<1(#0J(s%y@36 zpkGo)OfNDckIek~)<~mQ)N@)Y@GG0KUh_}Cd4)+ulI;uW2~z=E^kT!HgOlT7h{+5m zDU6Q!mR}e4Y{oA}OLxEi_XxHR9M+Qo$%nhyfLtyj0!?AzeR^&0k!H?uKmqM^lT*aw z5rRp3hhp`bm5aC~vWdkAL@1##=LGR5453+&{zO#SjZ`ouEWKnj(9HaMHB2QZCJ|(u z9sB?l7Ed0(vZIl&v^xIH5}t6xM6%&qL_5e&g|%hX&H_w#e$gYHOW^nccPA1*s&lxQkKPN_f}vt9Tsap)SiR=)}3Pv zH|m6Ho zww!Bpf?&*SHeJUb4v0Lm_M({cQ(^`~m_$x>GMc27qNV_CGmBtX$Wv64E{xhTp~e+r z82PJE+8Q0$LKxn91?-gW=Z2K0V3U@d@*>t~fZkWwjRi|^FgG|yA~X86?i?R09SBip zqP~?br4J=ZpMee)^@Kq$UQ`LLrH}aenME&^MO6Z$8|-|Ge~f(Z#2|=}j@Hv3|1{|1 z(jul7>zCPx)N3}P4}u%Z%QJ+mp!OdadP1cyZFcxTf1-mTqaJb?4mOLD;zS)gJmCue zG_!#GlsbNcAiIUW$qODzcaixy@l3R~CgQ--qlb8^#KKw|hdoT-fzY$217|nc4Bz6} zGwk!su!kB4UQvm<2y+lPVdQ8*Ik})gwBr8Zz4{tnV8 zuC6Kkq!&pUE%a%U95opB81&N@yjlH!hyJmqWCM3l}l zwVBoQT4XS~&~_9&#!TWhLzRPklcL7xI+@@AhL&D4P_yFt5Zeuwq`hsFMPnxw$6gd5 zO0|ts3P#NMhgve&rRAaVb1iS0IW7@J_}wtI4ZyHv@q-UIQG^c&H_G3mnL&{XgoPUv z@Cvg8b6e`9#iyoucck}66QM$cPK{;|HN>rHZf^c$LIuJTb^NX@rj5NE8}P6nOFMyk zZFqjlrZ>_@NObEzQ5pSzVP3j5w5PV-p@~XBXYoxCh0~W3`D{sRUvB-jZ8+fIS?%l* zYCOj2@J^ktf51~2veX&`RpCzqa`9^6jXd0ZA`z_({Fo8-L^QL*4Ge1Gg|Xf*`Qe4( z&N+sq;druog-#BM~s5hE@xKxHQ|;8?*1?RxJ%xtiGH`1Qg&Q8g(+iQOJ#1fK) z8_)xx*&IVNGcPYZM3i!;XGsN-DzgbDg^jj)VlZ%LwcdUSGYn`B=WT&#c-3-`uw?Q4 z&!mr~`vj;pBHR)|&$y|0LzoPaYiBnuWhxvEmA`qc#WjJMH zM-k5m&aa4ACK|!Q3^#Cggu=uN2N!S;kEBnc4n&4~gZ^i1Qn;rX)ULsuSM-WU(9rR~ zr*@J@XJ%{40YvR{X+IM%en~D~a6I6E1{x{w6&%8Q#EUKFcE!(Y< zccKQ+vcmUa?rg0$H(Gk5nl`8dX6#>-sd58Q17SUf7}h_3#2yg z5ol`z_yR&&BARQU_E?3P2SD@Tp;S+->b9pcnzNi5$xqFEvxnp^9X?=BU{vxJ(-}-< zMBm&5UIbm&sBMXp?e#Y*!doi_CV|~s8*oE}SC52GFz0d-H(4B?Q95gp$ac~JcW}t( zc$HY4jQ?HWp>qF;)`IE<;sD%0BfmLg@e&Mv7jnqnYXA|zCAWSF5LZk0&4kI$)0h^X zCGbJ?Jr`6}&MlIv5u}A@@Hy0ir4@J!t2K7ei6R^6@uLV`YP4-_USbcYAtIS(+5w1P zTJlS4P?((5zA#Syz=qENE$>PNM+41>H#=-oVdp-K?WElZeLFd!K*c64qVg3GP0ND5Ndx04R?JOMl%);&Q^NT&7n+SJiy|%cX 
z4R(HjA12H00boev+~(v*Cwd=%WJqX*i`tvO2JLNT5z+(;DjdnV>1+q%o7rfAa6viGBjIxyx}8Mn=iAv*rF>{hsuIT|uE*L}cA^n%zK!z6=!cWnbX8cUou zR;1~5@D$ByrKjfNY67Dr(8e@??!=bD{6kcAa;wAF$4uC$gPw4Zco5l6tOgi;$^!~r z6!4;zL0J5mJyd5@j%f^ik1Z+rc8~{C^l?Y3Z6tpm1BI^&|I}fEa{lSK_?aK4*BOER z1RCpx6^imwpsUmPXV8X1T24u}cci?zWyU*ND|;1eEMiz13V)zyD>&2|HKDY4x!j&* zX~|w|Mcczj{!4#>j6Z6L<1$+6O15Wi(fGiC)@F1ovY;RcED+``U+Xd4hAxnXzQM|p z>Ke=`XEaK8%!Af2h>ve~wk0x7?YI%E#^bHot{Gmb0CztaqTuA>zVph%eAyC~HW+L= zf^#2qZlm`a8tPA>Nv-Wb*&YSi+qkPxrh^4^3xp*24ukTmc&iyS&I4gyY(!~(b_~H~ zfspwjaR5Se{6}B-=dTU5T$}R z?U}PE14PJ&X9CGuA)G-na|hno{QZWrLhe`w5rn(K>Tx}TrGJ5gz2cGxI6de?vgiwP zoIwy4G_v$ZYaHJKeTYRXCP!)~X*vNY+%OgD1Y5O6=vK$l8=ok|M-ee9NVOU8gRwYz z87{5TmSDsy>j)BwSjF@AG7wCxtyzY`VvowA0o|5D)|S|1$Y(OQE8{YxgzPS8_n62O z9y5ptXBM(=qz~+X`SH$gjQu98Y{0EED%0CZ1LS^3CTKI(EC|`DKK0aR5_ZVk3G+6V zR`;0^Kty?CKw=MKSoG|4!|39W1__7AyiEFliCav}*6JcOmu)=IhzS3KVc|`0d2zH88&Kqoj?cm9>SAGh()J7*x?=7##T0V#|QQ*G`1$jO2F5 zsM}F}jp~zZ75Y!NCm+y$t8lIYVN0geg1LdciMdX4?m<-|+}h zFOOg&>E;{-yjAo&ve3De(l&@rZJ^L<@kGikPakPSAw$T+pF8xL(L-H|(Lk zv`8Q@mh3hKNY&UgOh;w_=F+ku(TlcsDX63~=_Tu4-J9K*n*%A_hQ#96cA{C00qP9S zXi25Y_>h5nccKy%k5^!&Q-TnIW^f7utX{bp%=!5!bS#Ki0xAumT--K|qqW4LLfQ&a z{w2mdL9_SqYP|{;R%;2v91NNnS8b`Vg6t}0$)pC1{sXJw8F#E zx_jSmxTjv;n8ZkVzmfY6x zy#5tRYj_H>bp;Ra9Kd+@ zD?<*hnIhBKDeU{986Ceuhl!3Pz;WLLMrut3X<)sAmtHbKaAOD8v?SGglAykn9$0x{ zKzsCx?ywaR$}A@hP);}ei&hptF-ISXD{e2=y6g;m*p98Lqz_MNbCJcy|qNxV0l@&)xln-p9XG-1!{SGsDh3 zJf0*a?tE^wkEo*r_d7>#PmY;rokjE!$JSOv-6J$RE4PjfU~UYHn(%F7W6**EHC+S_ zd^ezi3ZoyvO%7we=Mr_pjKDeFJC>v6+s&wso8Gy$0dl$d5ozVe!qLWFyetjF@<^ga>-CYfCLcTXsYOr_f`kUm>^B!Nyv;gUGdpU30?y z`WPlqoH@_dnb<5bB%M*S`V6eV4DReO&ns)fSh7a97BDi_CRp0J^a_Zo_Tfv4|`P2US`XO2G-gwlEfM(aGi0i|(GTWs*2yfna*aIp&y!W_en_64sr z?GU#OUA+SUer^eKa$xts&|6#G`j`Oe>)#m3Bc8k==WYPQQ=x3_KBzUj*VycWGY+?S zN;P?9?dCVn?>2OX0}JG4NgfhaLOmx7Zn^&xsfE|Fv1({M>=+0;0Nu`UeI2pm3Ab%n zh1`8;&M>WWekg~@xxt1<5>pu{GWaANdMMq+?v@0bPjMZM%|`1X#EeSiOVW$Dj*UHC zXTt!OR0#((N^emwI2Nb@EeX|_F_CCoKFsjadzL52J3%LlEU!S2ije@1gE(*||uldign6^w1L1omf9) 
zjp{|SUlw9}N}pM0%&Xd3Qny3}`^jmJvf$#{rgrS`#AmcoFRa`<2;$RIJhTGD718(^ zPgEJ7dJz5 z6Lcf~PrUgV%tPh3@gQQi3c6kbh1y{kW|TxG>_^b-qwZ88@RjA1kx(Dv z6*g#AAV+?|6`i0C&Vd6T%l;xVF7a()NsskX^<^ zta#7lz?OD~)aoEFq&i@1$+uzTBhk5*xm>c1+QzGlBmG ziJ>wp-?3mB->J=d4-9y~!#V4>hA`9;RqgPhvttmU!>a9pD(86LbL-M5>7kyF9YDu# zQtxtI0csV`c|aS z1<#+M$r-`Tffwu-Fruq9;8#v{y+d_Mvqa`5W7(f6k2s)0!2&7aBIx|nDQ|@HlQ-&8Xa-9`G5ud{w*xLo=4!8 zA!iBZh%>QG?0i&4VNuZ4S>! zLV`ILl<06cH%q#KT2VkySQt*$nh!hm)1XHBn^`;Gmgm1_3FjzZX0@4t^4G(M+?t&Y zv0WN1qu;!YBQ`S*TdFWS3ryqIg4BNT`!@*~zXgbRW@l35{LISAVof}KYUA6W*-;SO zK);QW)8W|tMLs$sI+@e@8i+zUoRB_YfYMFw+nFSd)k|tZybKs~bu7k7be>7qoY}Lo zuKxgd@=*a{Py0bej1my{fsn7ZD0?~uE}+M}D2Y@cm{WTkYi9sSUj(k%yeosS4$>ns z`mhK2FvYrAo)$%$u@(W^M`XTWvvdq$bwi>c%H z)`XU^79Gf2i;)XxW+DR!F4!qvDl(%WWd)!5eAr!eBY$VqCn{bc5Wx+(Q_WruxOgpI zdJpn)VsyIrt`PEOEz29@OgLv}v1 zJ8vjLHJHwg^btI{3G27V+yx|iV#X}PLzNbX^rg50xOa!TK9w${f+qSX5w^!}v}GD) zN4%Rn*$z0TxQ#v#&%c&FkUo|kTF=sz)d4%CD>?)E0BCD7GCRu=B5*7*&npP9z5H*a z_oa8~pDW0kceJiH@E%Xm&gnjw!T^s5Fq80HNupGeqQQqTZ7_1IfPFS>VIYE+(gp5m zXTRIq!3>15we)?@H8rup1x|88{+~deDv2_$uuu)$+Wfc)AO-txOFxwk^c(Ehi%Y1h z3xMR^h=#l6srDxMrR`-$mrQkAABoV-CpfKpL@yg8@|q)x1z0m!f_g}7iUMLZQZ4}x z$14{#;uAYiSmwi8D|Jki9&T=zZ1@|}=cSjVdla-vJ0Emr8rECl6zRPoeT(xcSH@Wg z2|HM=bP#^7rMIMS({*raW#$&>dxkLTHXUxw2F(q$?JS@xN#{Na0=tKr1V?u^{UBc*FnozC`7s!q? z$O}UrXEfL~l;IvorSwzACU&8Wna%6ip*hhGSwm224XGGM(AU!2*q=N6Zotk_e%@1+ z$UTO+kbWq=PWZmVV9l|$6o|VKb@2Y5m`k{ziI1`gKb6QMi&-Gc&_m6ifI+>Hm`U($_f`JL0I82dNpj zHBjK4^rrOZ(oFga`+P>qwgddg8PHT8=dV#>*qeS743P*KUP$)<)%s8*(Ca0JGST(! 
zf^9}wI>RLO4lIKJoZj}7SulkSwb_}**6up}`I8j>xl~aGTO8wE{!pJ-VsnGEjge+%he~kzo z_@@DEX*>*Q0Zdv#ht}NdIo!q^-_n;t15n?gExDz|$qk+J6sO1FYplW>(qBpsrC$TN zJVAsuZ1$dsWRHT5lh*%D>2I*%ORM7+M`ef_y!z(iwe(}@FF95@Gg)paj9#GNA4~6} zRD(}46)WjQXby>abC0-dXI+G+MpNuf$SIqjSsrwN7~kSO79$)aY9O5z3Y<*eT0OyG zHgf<;XdX*{4{`jf98i3q0(D72__6dI>HBmj-dy~Q2yb_?7hI(Gu3uN7CQnw7!gS z+Ox?^2)sX&ej>f=OPJ~|-y&pqkubNR3$SD7wu}VZ92=nYi8j*K(54L$_!6Yx6jR-^ zn4+|IAP+l5t7>E7T@TwkZ#M5ldWk_oH9k0~qtOgrA5X&b^?xe;K)OTsd$2OgAs6if z=`HD<(fU7;{(;=%b3}8AQL<*zR)QJ6C%x-SSW8c(f07oY4<)eAN76^qtK7MhzAt@O zdVJhXbAY~ZNf21GaxGZG6?^oQWgZwXk?BOaPQ>1428wh}!xhr9H>^4~wE>6-;zaz) z#QHyy{#weVr_z_D*Q9%_+!LZ^N!QX3r8h?=7I0-IR3=wwKY!eW9e<_9c$6K zG3)<;V77L8MEcU4|CRJ(=}o^l)V{KaufkeAV0O+GMn0NI_8!IrDqAoPDp-I~TyJ4n zY)0NYg!I1E7aDeWPLNbf&!k7vg-;?F&{=lUd(wZGo=E>By-XdpB9zPsPJSeP-#0+g zLHbzwdr8udq=zodm}MS>;D~TC;dcvU^7pmqhu|;Fa22h(I|*acm8|nAic_C zmsso_D9T29NBYri5Kskc;FLuXArRkA4r{a&qy%|EV`Iv;&w!uqI6MVu1jOI9Y) z8||7(qs)v(7&VSSqibVf;eX4%l7BK*C~Ww_`n2YjL6$b4NkbI(=962APehDY2<(}meOo(B<0u0V(eb;|GwGkK2QJEw z@tsS;`oYBsMY@o_ZKszNHo~+df4dy9DAL6rNY~Pvh+(+N3=P{@kJ0#((`6Zif5jIh;OTSU-H*T*{_IuLB384e(4 z*`mKAc(#L9gpS;0$3_JBl3}ibOgA_8a^o>N5VB{~OGCvv3{XvX((<+ z3d?FkgHI4hXG8qs#J<|`P-7ZuX8%NX7Db1Ug&9PX_iyW^N~`cQ=~DVW-DMdQ2BMqg zF^6jHM;9mMdCZGMy|zF*6Vv7q8PK%meGee!U`^)@wqRoOVihXTzBR)D7D)IUH!!CR zQg|^mym2s|3uCwWLI$s}i5g@%)VY8|pGsF~WZK0ACGP(D$5=!Q_3SB6<%<+(U+daC zG9x?bW2@0*ToC`>JZJZle+YaybUSX_P2M>+U_rTHLoU`C0Wy9!vv5bp|kUR6sknctJP)fpglDBqq1~$cs?NOPC6zA00P5*+Mjvs}M{0Y)G={>~-Rw z={pC@aP+(JAk!C%8}7{4#nMJcjaY*BExsI!R8SStuOwer*f^JHOBpPFW^FgYKk2L+ zWdomg+3 zn((|&s`tH*(~SuPYg}R&nHn@A%0RC@VVy^(5Ve>VESnf9@CjWta4S$@LxhS|BfH*2EpbegGNc2&5qSjj~2z zM53jE3YBFu{ml)Fb)kLd;tp5iH&Vo{3ZBF%$o?_TSXUR^?V2Xukq8FV7E2`sAW;E42A2C zPsEEl?$+?P1|O1QopWd^*92C*1Ic09uW%Zr&8AzTOC3GO^M7OOpFS-aP)785Aw5Rz zb0qR$*<^oZ&?ll%I;V2(eKOM9 zOXm13`11;p&TUB4$_FxJSeeA-=m6jr*4tUqR27`~0f1LCG%E~MO%TwaA+hMiFJ%4y zwR{p6=b`j~i7K5DD?&0(;F0ZGh|_|nk64*IiDum4p+&;NvN`*|F`KgCshvyfg2*AvI=0G3@HL6TyC-h4|%{@Rv`bf8L*H 
zY$CpR!-8_l21N+6+@PPkBZ3zt=L6;c64p&?J*t}#k0knN_VkKZMmvfcTw|_#iUs5^ zogIJI{_}ZIo09@`%axye@#N>$i!(@f_h9IxY~V4p?x5%{Sm(Yb91&2BB)1IQc)--n zrS(6J38W<^?u5quy>(|K&-~O1^q(AoX3t%SfU`dN;^Xr1FTZ(uP6a;xu0Z3jI5+eh zaSw{}Zg}{kgt?B|RYxahNn_^*47-^e<3oYSs z4W<3Wi}UAh`>pa{fq%`=`z_|?fJTTlDtw@1{A%zWV$RnAXAtb$jZLzk~FlQ~%pOXZ<^?Tg|NHBsJy-4cM#f^=K_B zJHWNEsjoqNIasZ?*Z=st_`N+Ul+j!fKOJ38&n*H6k2MMxVQZqSvz|SA`sh$@D<(A6j_J9HN*3UN~!NPO?VP~dtf7|*8 zMb^6>T=Ndd|1^uPw$e!rigrpioWhVL!ZVmBSdjJB9RB5!Bi28|nGXh;b#z-kWB*gz zZVs<8X0q&#$GMlsYmmAsUzZ-TX***xOL{E5D}8j#=1t9LpJC8cZs&+BX-0e|y)V5( zOgy;rne>YEGMQ^+E*Z^qA4wldABUvVNdKAis`MIQpd0WgcfK#Zb=(e=)6M=mqa?zB zbO^wu^nvu=@e&r&9qAS6Md_X$dJxe_0waGWz2!aNe)EExG}KKZvfmiPv;}bP_urYc zY2M-)6-dFI&*@9^nSZ?j3VKS=Eiait z);WeIMnCHc?tI|(*)arHVCd$?GW4o_hmO^sm;om%@Urv*RfQ;^CB07LUOtrGk)9o! zkUP?6q)*wYo-xWJr^xrSNfVOR|5dukmKHLHTtAWCn^c~RQxWda!dEddpf(?L%|Z;8 zFvD-nP>eGo+349jCkASPLkIX#1HnJID=m@Wx#@B}I!RL_2av4Km!xk>zahPBwDc8p zfLeM_`i}H>(i_gl%cXPaYtkP{pXDU7N75Bzn$M+LdSCjs^j|pFXK?2)OTQ<5m5I(7 z#J(1I?LFzc(w|E|W+aa;p|0JHzR_b06;tI^4tJ= zhH@oOaza<)`nK>np5j#cqVz}7Z%8j813p_TiP7In|3&)Y@%n#J`d#Taq*ttMtmO+^ z=_Bde(x0(;gFAm&`W@*D(#v?C19mIfpbP2mrT?$=gX3o4)@-mSXvILo$I_nD7I&kW zsL7_>=d>bWwD#<4&Op%2$Syqxw>7AuL`ZXM;Rz~sOC!*9BT`7=?@M2k?l3sw4CYFN zA^#}-w^2W7Ea5k$&r2`c*x-1a)QS4H(w|E|aS=|N_bbxxN?(y)v*FQE&D}7u%>08oYNIyPSp07&3FMS1R ztL!0@CH|@Ox6)roZywv9f>Bwg*22};K}B-}VL{LIzz1}etX5dtE9t3CvMZQNH2AdU zVzxH1GlSWZlIw+$y{0mNWNyDG{Rin!q%TPe;-w{HJq4L@B5)nOX)LQ{59#6x81YAIX$?i()-efzW(Xr-;w@6 zdX-^v0#)3xCPb#~IiP{c z67IkpqNaba5jYvGH4(~=*MB8_LHc9q-%DRcexH%q-UZIPkXF+Btp7m%|5*AX=`~u? 
zYlO1nJ71OVNskE8+t9pp=RcJ`Zv(_i%xl9~lm&yor+W#v*oOz$fNO3Lb@-M*vEPE6 zv%lxY{><5h8Mt$fG3>i2J2NW>oPLe#JD_cm5G)MZ8C(D41OG(&rt~VaM^^ef$Po=NILkJ!~boHiokpw05%9Aes6X{Q-&sy%5^mM~2zAAOnyS{lLJW+!Q zs!b?gVFAdsbqVyV&@d)rDcv)`bW7ZKPI;yS{~2svGEDbvep5so7j&nF%^DP$dP_8u zUXgxF`fcfRXl&B8`!sl*S#b8c^%H6)y)6AU9yFS;=XStWMsU88?mCZkF1;fCy7WZ^ zXvLC-E2|+jydu3Mt?0Jw4N!|@#_De=h(#7CJ9v38%3%j%a7U^bU~~XP9l+eh$ZwYH z$(FW{in%mvSgEH#s`o}b-1t<|E7I>uegu(0HeubsoV9*;R^d}Yb7tlZ5=XTg48)tRMV z;cGg>KgROEkUl5jKZ`;V%3m3 ze9D{NltC8Kr=;JNzR47)1ITrtKn>|gqD)%(Xc)~^qYBh$ z6{8GNfj-t07lkHlZ5+_eDje**j1qBe36{=~wgNnV+{W6W%>jn4`Sy%i;2n16m>|-F zZ{Lt!kveIE^}TPKf61^Rc`Hb|I4RHTL}w*)`!d$DBAQ+Kv8?Ih&q-eewtFP)$c8$M z)}HqtD9`byL$+q5r3>)vFf#jNEHYFgGqnpj-4hsfCd&g9Cns*Ja4}mOE81C)S`>uG z*7J_^tI{{6FL8{cS9LmOtT#BhRV-oB+|Lub7AEi_3CviCW>mor@aMPjJ1u}wWpvA6 z63=g2{ElSv$}-TxYH$^UW@6IT0D)M;6H3X}rPOdd#I=^w&fqGwCk%oh&Z? zlJxJSSMb-Nq9x){SXfaV3sA;O$0Ve=H3S4PY>VR_2yn)s*rz;muy9(8NoG^oi%}z3 zupm~9`e?`>BZ6C_l{>U=V1tqmeT@ow$+-ifAhpM#H@K1B1}E+LqVyFzExwaZ5Uc}3 zMLZ5G;?4`{f%JlOk9ybM)U{$j^@agx)21i*FLPd7eoLRpckZvz&=S} zo=QKJmeR}Q&gbYwjOXedz?Uc#HCA^!@Z?WT1k1?)C^yNi4Iw&XIm4On5$Zt_bzzMg zJ9a*|1IEOubsy7M+$*0j*Cm^LA!SU>ZS0uA!r-96!=IBNE!gdY^^->sT@E{QdgXal z`V@7$47;27dut$^lmy>(8=ltxQ_`nw82^%Z;$ylK&oS@Y+ZY@*BLZui_ZwxLmPjrv zN6$s;jObj^N_2n@l)k;CdKrP`0Ds*vBV}%UlZd`6c2UwpKoE@`z>?|Jcd$|!Cy%$o zv3^l8?FC?ga9bZ0PfZ+#bFE;TIE1@3Z#4qj8u)g%))<)N=JrXzuzk4dgF6QMIZ z_q7{6|AIgxcut*_K{Dd7cEp8APVb>~AGGk)#7Q=J>QuLXf~r>YpRElr+yJL6Y%c~F zc8v`Q@bA z#)iL01|H8+^UfVcsl@n{ZhuOUf*?O-=vFN~h86A|&f^R+%uZ*T(_;U^A9eP-gAAaQ z?n$rPocZMC!WnKb0r9+*`>=rf?^q9hRPQoVwbGvA=D!EqxZ!B`X=K5No@7d_9R+Sk%e`kP!Z%07a2b|VD0_x1)yrfI_E_p<0b)U}sW{24I z>04!ol*c0^?VGc;X$IVK1c3S{XH4b;szf`RU~piA&e`eRXy?-XFG0R`(i|?(DPk_^ zLi%WQsCPzE;Zs~TX0C&mc0!;y{=}Bd=)8*>M8G+r1v#a=0T8*9KDJ)z(EOB#E_ug$ z*x7PqfC3(hoGenw5(u;5JjRJ2I=GqwEnVS>H#|jzI)gUa%5v&nJh zl~o2BY8|K23*2g99YE7ba7LeKfF&~gIjUk?j!A1`{Z`To_%NY)(biGeY`+$MOHZMz zjHh-+lv7&_SQtc7SQ36?4|5#IfxxqnF4=sK7iy5m2?s0;K(vZwSh<-o@n)u{g9vDb#mQ}|ee#}HWB?^kF(@EJ 
zWD>7h7&Yrae6=TVnoVYq-4JxHU@$axq-Q}2UX6$tQQB;56hLL^Q>2A!Mn3O-=flx@ zq?VCs9EQT%Oh!Kk60CFU`E%9ZFmCc-XG8Y#R}yzLe$1KtBntO#r+2XNQNS0c%RCM z;jO)J3R33B@)I=Z+=!Va7JEx(SyJ9x`}$XQ%u!7zb+kavY^H56OL`n4%1Wp9oKdlx znayh_&1U~8EDos%sB2SKPYd>;g8^Ng5IgO8n7~4EgMLG#`yxhd;9!f=1roNi2xK(^ z4_ts3!?8bm{yZYv8VQe5R1YM`FpCtpZ_VRG9W6I$>+gI@Rw}^2f`C3+Smubgjukk= z=ttBb#5i1Uv}1*A7j8r5hM%+uzyb_*O%eW_zZ!V&IZB^djj;j_sI7KB-3jRiWXqGE z@kS-FbH~1RlRl(jon|H;Yw06g!kijRKDv2kZlGfcJHo7r^MzZ(B_?jZq4d&F7+E0~ zwVey^AaTF`DFAwfRXCWV4uaYqtK7H^EUo{rGsW6qtD<}vzfw;+V9^EUHt;Se$brgi zZX|HiJ~v|==tkfrJ&YJ0Y-Ogeuw!r1!JW4z#+H5w>SZF+y;QMV);D>CDt#!X=884`LyrqW?UgchDqo-0N$a>l0IN@L~6J)u))XCfVd=cj7`xy?{(Fko|wX~W2EoXFMzfL~K< zTUcM9Y&zG@Qrv#$BR*UMJD%|f5i823Ak~v9T;iX0ATK#ag@yn9dHN3f^jZ zk!QSfJ)CkpE&{i(YXaxmS{X8wvakX~|Jc0&y|j=i0OS(KH)DqvJY?Z7kUD$sP)Dnc z6fz?bOmDn(9(MqEftwc`UmCdgDK*jgaq{g3WDUqMCuB~___5V4b86LVe}uwhCH#1a zl*N7=FkOd{40MDBfmszc?WREbo=88FKECZp$f$#Dq^qM|ni&zmlB3@8AdBJoX;BPlnQpsrA5>%CPVk`(Fmk}P zE%r3LM_+@f4J64d#-3r4_Es2~pdm&Oe}b3k5QiNaTLDrxj0)_ZKkdX?dMb$w{Y7G+ zXi?ZwjV@hzf|e;$l51n47ChtDdbB1Wtj22OGm@Pv+p~D>Y81!DsuT!$XTDnaor*8d z{NnUtPe8u|JJ=c{X@LlzjJ#j0e@-=jNrEdjHRnfVT#doSB`>@-+E`TM4@N9s!1o)Z z&%ya?keSK^X=ez4CNr>%Y2){aWOJ774iCJ*&R-+1JqmsxagP4Uz)mXjn5jbh1SuK^ zZ~?owb}ri%r=1%LGypFLpFrXoDrhMKSmD7_W9?H+WiNd#NyZB68^Owsc9I}g9!4H) zD?PF}vV5hxL!WPMJC3bYs{AxW5)7>!ukmSEx(dt6&Jv44d<8FpL*m&lfs<ZT)ND}7oZ;5QDOi=!A_79Ld9U6EX1Mh&TTs|mcZMZhn+lAT zH$^X}ma;U+Nj`kXJ$YnL!`;(jURmNYTKt^+U~`-S2ZXdI_gb>}SExg9Mdb3|vw&vs z;I9q9+9CROV6sDiW;8~2{8es#qL<;clzTe?BfQ6km}E}iR~{Qc z4IOAkuu+)MCIt$n(Krdo%tY)dnO(D%7bN(l^hA0BC^j&lYp}V-n5cam`PBdbfB;EE zK~xz|BSQ3BO@H7oQEWUTmD^B|y#$h8Si$1v`GbToCm`A|Hsyp6D}8&6Uo=EIkyl2U zI~tjyIJuk5-=l`mk*wBs9%4;jP6k8rq4Z)j;)yW#0CQwc$|$hB=u13j(B^=Jl^MCT z1#rxyHH|!xNK5q_Uo za#`FKJ-8r6p{TZcT$Jw*R3a{q`J_3qPEOIOM6LpV@6d=|{-fldptdXNBb&$BnoFMO zqU~*96ryVkoh+1UvG5`zSvnY>*-P&&{7k>HHuIEA_uK)4?VeS1Ltn8EG^iW4ga&SB6j%ae(P%5SVPf9bg{_sW^_D^Sp;Fg zBM1p+5IbC0Q%QasiZLS^2}t=GdmOoZF=~YCu&+TxSnxP;&(f-V(->%r^?xRPBo)%9 
zq>c$~@;(R)2r(zLiflVJDS9xUNgqm2ZliLyJX{7YE@R5{8ALR~JRsiPlXz4H`ktZZ%gF_p4*_HWT6=@Wp2wbh z5Y`z(Bew{k3TIUA{eh}PAG6o$XQ~jM+A|If0>Vhn8G*$9Xl5- zQ~{0E!#<3DjhK-r)-=@p&dSban6C2G)jt zC*4|McUso()W(tyGJwL4{K+kRkHjHu)xz5Ir)ju3X-dP+U4cK##?Mp`VFm$QF|$ zxg_n}nW**E;ef6yo1e79Z$t(VOsi=OI(9U|r&AWCJ|eabVDJVTv@pcKq3+aSx;I9m z?u9S1?abniC_@E|ej+`X%)kPFx&$G}jg6i&&^3K7t;2Yr`flGg!@;9lGOH43u3s%d zqKNck#`(&3hJ^l%cFM8*2=M&iKr>s6qQ3+1 zQe%bV=*0|9M{ASh#`1H(Z(kb=DZ?4>0d*>fz11jy+f#j6at1^6pBAit#0EW)DzKX< zR?mP6H#};R!c!B|Uh@XMPz}xQ)lkLf@OmP7@}gT0pw6uEI>XZ}aqB@&-Z_epkv~^9 zsB3P7)&@z;k@NOA2<<3h1zffo#tU<7(9XTjNcy&T+F&1MoSm^J=S**O#~na3toNkt zO4555>)&7rKVn_R0@NTVA4#QjAJSd}?!3Z0S0+;9SLlo@SlG&W|4D5)WxoX-AW_h~ z55o%j%6O{NOx8fTS0r09@F0q_^$3HNS^qOc((7d@{N5oQ#mIX^ju!{RJL?0!E!l#BV4nV3iR{jpdIV-zQ zg;$z*cv;F*84WWCv<=4mibFceQ3P7ql6pO=v+~x=3P%0@)zSrnXez91MxfZ3MqT3) zn#uJq&DVsUM{`kQM6_x|LWw2F38N}p#~J219^^1G0TIv!epd3^0X3G0Lvh;{M6=B$ zc01;1v>1=daGk{neyMP7SFC(c*DI2fYw073p$ETtZ3U!`DoNn#sgiY$+3CHXjv=6q zWjI*)8pb3L_R2k&6JcbjQVu-3vdC(M0bO_m9rf!y-mQZY+MojgOdlqE=tKIg1BP|N0d&{|uQ!%Fa|cM= zGBI(SK?cLCH8`Nf6Xl@q;m>;qT40=3{AU!Pmd0fZJawW7A24hMmb2i>6lO-sTN%|d z_r))eykzK7n6|Wf((<-5pxCAkgP6zU82)Pn0s#=I!y-g(aSawQI~r${$H} zPRdf-{I8b8aXUH_JNQGbA@;!l*)t3<8!FE@gA56cZa?Si2WKWa3tIE=z5Ik0%y_gg zUROhkNRK~<*_h7+V3ELy>i z1W7*g82A88(AvT-t>LreH7l~A{xhz_m6*$d+byi{QDP)x8wa0;z^}Bhvf=43!0nb6 z>+}Zb8oxX@_MCu#D&n(Cd)2W51oF|D8Eh!OR>m(M)Nup0s(74=*e#6F%Ix7s)}waw z$w4lhNHV7?uqEo8bGE_+h8TYHo=u+ac(enGbzw!EK96wMQyZev5iez640jmRIM%`F zc5BU6&gmT55JydBLD*K%c3!Zt(Q{IX^2plu;L{r6?}*oT z7{CfQ7=2w65NK2qBI=)8;WHW{O6i-L`rH&*QG-g*LJ zk32sae>!u^?tL;IwZZy!He-6cg)+ftEy1a1A>Hu1=N>~820L~pjycq($fjbkCSD;b zi($Dg2wHL`yIyf+5RrN}x<@z8DR*8CW6MBU2K=q?&vEm#XAj5#TFmx_cNr6Cdj(jK z&fSqR>5=rzu(p6#TRN3`54=MxmcvI3+RA|}GRGtiDrh@~&lDgW2ZSd0dJch zXBgnl&`S|Mini}_N73@3(VrPrX@P7XtlNB|f*81Ihdx9pFz~xQG|CAYZgjvzBEBbl zoa2UtZFuT7sDsD-lmh=LCM7sGHBn|Z!qB}~dKgx{JL0u7nm}p;$R;4nJtsJwTSDI$ ziV-HU9K`^oML%mB;ZsZZEI!VyF}ZO0St4QSR#(zf>1B$Yl?}4$CLdme3kxESCs?E{ 
zxMFKVjU!_2Su}Ga&_?%dMhjpv+Sg=510qt!2&HJSDX7OR%&}e~)D!aWRy0ZV6~|kg z-~dkW-eV$8#K4W69Tgym%kmZi=>S5_*}oRXb!S}l=qn5TA5;(f8bAw&ud=atk-S$t zRgNngvmm1W9@^^}X%wf{oQ^c0GB<49sUe9G0n0q|n7|fGUr?Z46Ed8!lM9Q0b4qCg z7EqK%6T~?u2zRzj;CZD>pUf<7h+{ibz#tbN2u|x!`rtcRAvD3LSsD_%2alcm{X5n# zC}z8JsAs}QyD+|XEO=#eByvrC!q+E(3-ZW@+5ENPQWFqH#L;2EPjO&T zl{g>>ThCx-;0!^!4y$kt5D^71-NvKOtWp`Amx_m~>}&O?43rsGd`<1VCLv!KZr$R~ z2SlEdaA#+c(28E9rKQCoLfPc=N9X8*Asi)ryK@%olFo|puhdp?sJP+IW&@NQ%RR0k zMuHNtT4C2Tykm(C9yhYauLJ^7KpZ%u^jG7?wj4<^4&tNGQyFlQ@#iz^@XmaeT4wM= zlCXfvT_Vp|m#dNd_YNSnT3t#nVj}Kh4;o|nhE>b`9ioC0Zl<<@3N!MEmV)LWh|bXI zlAfD064t^pznW8TCIINnBIS>)?Pp;j?t)2Z-H1lkSr6~hjOiYDEm#1to&z7$8j5ua zR1ntpgbwH2qOoZOh`!r&hqcjbYeJ+KjkVARk$dRBr$A*}mb2nn7Lba@C{8dFR|dli z89GNFt}u!DaS$Gl5SGvb`hZX}3wPEIK?I3897PHvB4z7Xpwa@mUiA|x;Uyr}Jte~i za~n6GPmUsxf#V_uYOTAbMI3h&K?epTRhJ*wS>ze+HZYzXI91-snK8vTfXVOyB~KB} z*kORCBhbQnXWAh)URpxb5eoN9ctB=yBQ6AhzQYrQB8>Mxkm(&8xpbl>uT!Cm#iR;^ zA{NBUxhX{u8gh1I%mb03SRr$b<&-m!x=R};68!I`A zh1?Ko^bq2K_lX>@A*Kubp|QgEAb;M`7?k?PuL=ChVX&MENLRNt1LtI&Yok?1C27xg zW(LFeHHF3Tec`x(kM>VX?k8rE@n>`<>e(2qIFBQ2i1Co-jbk0lY1}E4_*) zCdBe=Naxtnq1y8@(&qhGdPBOyFs*XfLf4oj-kFoSS?c$tCj`4Q7GjMXE+9o*bDrS@ z1Q!kJ$uZ=zmWv43s@8XK`Yz<`6Z69vHq&1ctG(t{;eQ4d}J>uJj(87s}Tn)YnYI zC`VIl-1r2_ouU6Be~qyL1kAe`NnzTI4GgnAI6_Ewg=mZ+ek^@gdR2OveP3D@z2pA1^hA1(nE2+- z-rCLtO@@Qa*1+%YeQ4UezmXnDUoybL zr7R0A!h-GH)*cjO=GTs|oafN1I zQ9-PD;g*8p9=kQsN)b1V+#?KI1*kZ)V?)OdAW7k0ORq_vw+7tY0;}4F;rBK?$^Va} zzhN3d5U~oAw$|1t{`2PIA4q>Db<)?31{|@*p37Qtz5TKL%j#a<#M1E0fn_#y*!N4= zS?8e;p6elVv0$`LGm_ujP`EAMc6Yxpi&%~-$S(gsmi|h*D}4bk{z`ht2&R@mJJ^j;;@^yB@%Ij34oy60bVBKO2IMS^ z9@t-;VQ#hv>TBjPJhggeSLwYl8c)Iwf(qBt2vS=Z6Vy!En>vwPfH!`%B-jV(<4De@-47LR`Si_O|ne@R3 zLl!Z1@}4hA9YOGhb6oC7g^eJ-9<}mEFU#J5sGW^%>9AN67_1jAqj#V-&dHo6go8Ef zjX!(oJ>1{}zL!A^o*BqqlUJ{O{nN$gxciw^x*9I~Sb8G8Fa69nZ!f(o{om3F3Xo$x z9#|>4C0zP=cwlKS!kKye4VzaWlrBySM^QRMhI5oY!um_1&WL?i-iH!J>?`R*sg>^G zI`; zT!b^wrZY6um9nyAbC30$fIwpj{|CM1uff^7LZX5{`S^Gw5;DGtej_9bKZB+ffP@KB0rj 
zxH2BGtJjtB(Qe+YHe|FkV_3uOyrP4<9rdvKCn@)L5Yexo{iim;FoCQ;mHw;rJ=#}p zF20t2DE*m~OP`??J+l*3avEcQAid+e7vWb*(!Xphs(R>s9!viy{lrx-J`Ir633Ath z*srmqS2pQmz#`a!>{rkxGV4Z&I4U}^H=_bVYe8K{5i}lNSnu=E2p0dOAm|P0&!w64 zA#SMQB*6m7c`SWbdc%Ea5{w^#pFLvFD$D_9fJ~69)6`X!w0g`oZYpwe&Ww zdnlOT?WLMskzAt?bcHxBlH|T$jqZKA7V9%%|^blM_m7&CPm6WeV=oYoT^J57a$<6jY)c4S0Re=mo)!c&}3UxYb9 zZYvyjz-7SzP65S>%+eju02_m)4_5n~u>jEvl;4&FG}064yV4`+JJ^sp8Eq)(W9cW- z&yJ_!Kb5{ET}dyKEw(lwwxRs=NP73U7v@s>o^&OB%iys9`jP{>klvSm>L-f(C)O!2 zyohNcmrNUbF(M-5ykR@+nB22-q6iw0>YxZ1weu(iSG;gz32#&)rXiF@dM15Gx|05e zz%#Qhq!upM+tM2&AS{J8-;*}dcWoG4kRlrCif;IKMnQb?{{O@bwKJ=SEP)-pPB9I^ zlIV0jy}9s}!>6z(qiR z@LlPd^b#mshB`$}J4xZ&qk$+%{{InAaEe5Di@=^iLKEwstlvR;LwX1!FDy*W#N4Dj z6Qfu<+vnP9a0^(`1@`!WCvBq)5uC%uV2Z|SEiHS#vPwW7fL==POOjsX8wX7P8p#g@ znOJ!j;qOS7(o0roiTor-sgek9Cq+0BG4I+8xytOB$p}P9>;I-C=}l~FFpq){ znliA#oqGpxFvLCm=^k4z?9t5EKTy^djV~!#tucuOYcc3R zIR4b+wm-2*+39Yr0W8x?_W<3Z;)C=q&B)T8(!H?UfkH|bFUqauAjF41?cBB zdDH^ikN~5l9asAzY_pqB5KR{Z4h_lLj2U28lmw!wq^Gb1w`OVfsiPjUFiSkYZJxV} z@!FQq|F?VoDu^9+=+hUnd;aFd#a^ZGjeOpnTS{{$$Yuy^ghWostK)~F2p=g+Va7zp zA&wrOf$(=0D9o*;wY6ui(c54vKFOrw)WT#N!=gkL6JT;EMAMdOTvFji$ghlwVPeN3oa3K%}Y{-GupWx!10-V;`(og z-|!oD&oLpfd876BE1!4qddhgoHh;W%eH}NEF<+D(l0P?yTaER`==1`74u|A`f_=L;i;`a9a{e++B(BxnE;>gz4 z);>R{da$IpmVoIGVVvK*WJzf;V_aJBB$6_GByFS@ke->x@f|BXvvNs>KU)ns@tJ;( z&u#1fb1mUNYyDgM-|oozKX37OoLu)ZU6ot%zyn2-CCgD;iMoI0!e$X|aiFJGUTDyZ ztI;^+D8I)87v_MXN&Uhmi|YBW;{oC?rUkx)zvTKizv#|~-41g09{oR144CuNt;Lf! zx5=T;tp_mZj{VDl=aAq=Tv#FCz~+?-!E~_6-1OrHdN`_DFi|}oUj$7x0bLj zh?)6wrYZbCZ~cGXUug{|5RB2iL9ctDIpJ7(9~wcKuT9GZ(M}Zv#0X;~s{jW!(b_@g zH70S6JzE>9`!Bhc|E%@@g|E*LuN17`G48^_2%th(7sn^ct>8B=EOadKOcubvOr_>)-)&zmW0y-@5)k?@#m&pf!B2Gy+s~gLe?iE?`OwLh;W z(3_ao(mP?{ZUod5^z|CwTfkmwo|A?CEB%D`4TjIb2APO6+Dc;8?Fa_W`0D~nVrt2~ z2k-1udce8lgFGrSupC<-1&lL(_kb)!iMaTM4A8%dPoSL_RwyZrpWwX~8o9yNd_v7? 
z%`Eh(*<+0|6Ngl498Ie9-nBOLfL%AA2*Ld;{p18U&j=<@xI5Pw3Z zmDxm}zy+h5BtNzvndNQ^_I8d@e9C_Y0AK%`u|I=cGP5~^DMma&+@heAnCVZY3!f$s z{k4w)%~l-6kWmnd(LnR#No*T#7(KWv#xmBRL#axAF`7sIoX;M-K&qqfjVh4SNknFee59RD zVTq=q=}&Z29j|z2*H3(+`jKki;rAMNB?m&0J2vd~=9Bn&A5%k7|zOTd<2eBi21n`f~fHvy~scZ17wCBx$Cc4h=zkmK?&&&kPH{2FO@Enzc+LbfK(pRoR!{ffVTYasuvOPJf@XG8t@go|_Azj#64`7wukkO9=dW#J72 zSI7xVY8&u6c<7`94_NCR@yedGbxwl!A*VdtJl2lJgbH61$@iA6PZhK?J5Kr5)t@s? zcxeyc8MKtJ^^0RBXeuS$fv7q+zmt&`Jp(GPkMErNrykOLBy(D#Aa!vJp2f{$+|_0{ zKdB)U%5Vl(zLTxAp-Jj>itRf}@zGnoA<7zuuN`Vt!tOpdO^P1ajDTwS92SJH|H@c( z@xRff|LmBJpF#LIr_vS{Hz;{moV$9My!f0ZfWnM+lvK7{Kivl)`(E@VX z6T@Y;jQBjIovj{qL5i&|?1+lm{I17yF>>qH=rd;o(;Vdb+YXf<5 z)Gq242?u#Lx&DbUdyj}&*srkP*NkE8FCJLZ7_u>=pj!?d(vI-G8-*BA=GyU=DNQ(F zF&|LDjHycfzQ=YI!+!)TtK)YI)02rkNH*X#=~txB&08oIM~!@fQnkl3&hJE>1f(y!D; zbZ$&n*DzYHCLLHv!AsKDq%Uxk%g)S0dZLe{f0Di{eK@&ZSCC^!JnNFEeyI7=R;&Mbj(r-(DDt(4*B-+a7 zJjG+?KsrT6rT-#**Bjc?H-1I>kJ9f*ud;Za z?O#jpNZ-XCUmQoUoi%yQZ4}~~P&AjWrAHhw+FSo%Va+iyd!8#uC8vzl*p48S!bbG1 zEO0+SL8?IRDuU31raoy2zb5^@^c(olhLc5tew{k)cG5Es#VYAV>Fd%TO25j2?~snQ z^jPYoOICNoSv5Ce{6PA5(tnn|M2?bd@D-yd3+Y<=4xZqtuR;_(?n)1Aet0y{hTRTg z=AaUi;JJ4hn-slFg`Lk4UGdA2^>W<5GmEvLqjJZy%z%b&mH!9Q*QEbb`i6AR>{h&p zq{q@j;`rK^@S*e#=|3^YATacdQ>fCcjlacJ)}!_R4e9?aeF;%*NwXKG0{=um@E{0J zHvF;l9wP-B4Dg0?2_u5vlLz-p*x39s*%bN`UAQv6s4cj@k^hRPZ3y2}J@bM|L31<4 z^i}CaD_z74h196^ZCdtj?%Zbp1>~xZsNuka?aUIN;4S)*ViDJr>~)~O1vnV#YflJu7J_G7%3(nA_=^}92>aR+F zD1B44%&OaFc(3S^8b+8_Z(bn*+MS20oNdq@PJ|L>(}fPNmOD zzbSn~deIIR2nIq5Xj2~aS?M1~^T;yf|D5!pHcPUn7q1-_Olse}E7U6H1>`_OvaF%D zu?9VU5iQQqO0Y31K-is>P@PReaACmzRQjg$S)1!q80ObXcclyIAEh^fOA`T1+Pv54 zK)GaH9-xKkVZSB)z?J7*`l9sP($}O<5e6Pm{)P&8!g60{Xv!GK>TK9-X{2X|pfkey zf&8^-!Jat0HV3fdK%S*dhUp>$-a%}PxN{F+Qd?%GVYOy@{eMIHQ|TKvyk~KA)=#2R zd`kLmh;Ur|2h!K2mqDQ$;;sxUl5FHl($d%ep7eF;-%GzQy=13vgwsh2abMa?pCfv0 zdIu0(oM_=#(*>4ly6Hb3NN1#zgI=NrdPvThDm#cg;Ziy*)NW+JJ5;QOhPPm#U`yI| zVsoC_V;~}uuipWJshIc`Q)pV7^EGoYFXici|*j^=Tv?pe=ANG#ce&kfj;dxMLX053?K+2U{fhgc?4JQ&7Wz8OS=2-F(esm$-0mOLBc2-!f15^E``;gK) 
ze_8sE(l@13veUa@ISrCpB8B}tgS3QSkp2(po3Q7v(42Du*k}}4QsK;{Hto%4rQc=% z<{VY6Y@o)1#AP)+^W=yHRFQce=s$@%Oa?~O9iPnGf#u}P+7B{UgdaKKLq6hIRE*A= zF$HH1fZJk^qAE~$?CKkECVfTvE$P$Nv|5^>JuqTmK3u~lC;a>sGT$>hlcXee-LZ^? z3sAcASEb*RzJ!Q`q8A9@+-ikqlL{P#hAW=7B)LdU+NprCvPrLf5*1ljiQa~~8Fu?uP8!Vg0l~VBKh{z+uHfKE1 z(vE*UCzvWwu$e1Q;1Nkw7Hn8xMjd9bwk;fIfOn-YOJ9>-wMczwfJJG?ikH73ZZl^u?k<4USa*_R)b2{B-gj* zFn}gd@TK&s^hN1KV(pTkqred6q*AS~f3nJ7kiIUxP5{!H!Htms0l!q1(;RgImw1N~ zOMK2wt7`H@ETiyN)=Sc{bUh1Dcvg!O9+xe zaDNN%+7f4) zQqv4-#S0{iThy=^U0-ugSHNhYnVFrRjV>1UZOhJupe%Uu(6fm;SX!sMHpD}?+Xdea zI@N}`v;8!Xx%5DK$V19#KQXr}w;AOF3eB1HS?SBtD@3}PUEdjrJbV5mugnCiV*bk> zX<6}7QL1S?0-a%H_BJ-XB(1LT`|f?Sc!g*J*yBu-v|SFFfDLTl&9MPGW2K>RJ@(K;K!L){ zIl!#8us*YDVvr&CBO;0H@a2wOy^<~=WkfDKATmT~H0MhZJ||L1@)&sCw+X|DAKf7 z_sk3h^|rJ<>W*t5jZw}^4A_D=RMum3FgWgJ=Yu43ZJ=p>WUc}auO}nug>(;)uq87J z!i77-4gd7b$SE;Bw?mL)33AqWW`EdY=-JUMmySm*hnY>fa&?;nXehbuK!_G7N@vk< zX))oT5h>c10`sXj-$Uko3g&Y6Mf}1{&s|g0XVN3c6&Xaf&21deuw)C{xh?F)6>FiP zX!T~`gm^pM{E74ksBwYL7G&w!@trdyG&k5b(u|3oYiWKPE}A=4PI1#<=l>;jpjQqR~7)N-LOk#@JVLpDVi5`%bFm5hhd-lJjPI- zlVJx-RFYz!TDXA4ap*B8)7T?t@&0pOxI=_``&IhWZxPFU7&1EPaBG-l=WCc@#xgUV z;do}o3BMj5Q+~s>e8%C+;p_@*YnZAt>77XuazwxfaPL#I=Oq@e1&`m1Du4l=>@3^b z7)RrPBusNqTVruVyJyCqS0*(3$$562G5%t6E3$zNVi0j)aa@NAM^oCN2`w8DOt77i zt@;AaTKbS6rZR-fyO35!YMFpRQz-JHw3M#!#|^8{&_Qr*rKUatPLNz_DT{dW(*7yz zNWsC*N7LrsCKtul26;x-V>I}5fd0=cx(=heU^`O>M}U=sBr1GxHte9uET-KfTF*iR z!)~|a>6Pi(Y?!RghDYO#tq9LPCkERDjTB5g8FPUc1L7??nzXJ!CJCl3!LtnFrgb8 zl=2!@G?s1r3Z4BE2wC)Sb{K%d2JBYu10sA(@yZ@&UYZRl?Krcs_SYDIl1y<8S`pIH zn6^)BL5bK#q*jXS;e_zS z3VTmMVMpVI!C8*-CZ7;gZG)6eCDjd+_)6=u;uCIRXUKT^pi7Q`Wp9pf(IJaj?TA_ckOm{GmW^YCBkSk31dF^TGPfr2F(&RtWvU{9%Gd&L&NNtU?M6&+N#a zh!XacF1D6O_ zp{WOT&kt!-$*QcqsDZv6gsED3D!osNdnbx`9Z5~iu3p=5XmhrG4>Z!==6xd|IFQZH z@uyBQ&du7F#}1&diOk{XcQz!=DRjAPF@wOFp?^siayapog(njt6yfuR->xl&3rs~xrqmJ$|eTnTiK;K6{vBp1ifOLg9hKk7Uf;Z}Sv7*x=dPlBd zYXow5MmNecN1-Bo-D0i_TwiB#SBY&N-+8nJ(XSKwGs9>c2x+!dJEj*Rvy?+N8Yf`N 
zNYrmWDJ;BCzSJV~2T<4wL$bCf8h;{O49#s}O|;}It<`A;W-PtNoVvq`N<=`d3xxOD zBB7p!yCuOYDUn6<+QKr!%h4b+zyAS8k-(cr2*RoLV&$Y*;n@21Z=_4RVMhiUoo`EC zYETC1u$;T$;EqU~pAwawNSD$F7=oKmp8-VL8~wlmnm#0&o0~rNOh}oP&J3t~BCVwt zxonQ(U5v*06ad{%r5Y;nf{DllJ3D?5>FZnamLRT)EA(GJvcha9?Bi(zV!ji}ai{u5E-%L!#= zQ9}+*<=H5FYOGyURxfXXp_kGFkmQ~7cl_p|gJ-mYgc&+mUsVElYCl^t%=#&|qQZ5w zh+1ndwI1>48w)(Q*7G;FZh(;P?zj&{7C{##c3bnO;gsm>Y#uFTM z>5q_E^Oq3&1JSf>6u{o%;IaH?1o45qb@pBb2z$tH%~N##3YlT$LgzAbi6WA_@{4mb zy>oap=A!iU$F1Wsr{!&l+1@V!$ zJhaH{j`=WgF3 z6*7IoPwt5wS|YQ8c12yF@L zE^NN8j^!r`uMS?;a6azBM=eN^$1zBxX1T#@LD*_go@fo#sFBS(2P>2`^8FPz4Atl+ z7eAjGqojRvz0!U9$ANSF9|vRCGlR@_>%~1m%zEp5dtxY z!{bIfd&JI6m#^7i0a|pWB+py21{M1mO+8nBE^)_mcg!egEM#hU$Z#FELMUMnYa8I* z4Qs09WFv|PVSA#+0+)YI%|6HdJ|Vd6S1!YPFDzKis0`+GQ$(S1%p>lwm0L(I2UGXX z64o6Fa`cz?aKCe`EB0b7Js_-^TYk}n_lt;j$C}Of#WPcbltMol672_{D3F)r(t#YL zI(AH%cruCxTdV6ufR+<4AM8B9Am;U-3P&x&Pg)jrPwd-bz^04v1x369!qhWsV-NGb zkse9!kH#^zY)mlt_b5*o^A?vDq?DUagf$_XJ3Cq0;r;wMM4F5@Ojz!k2_iMb;sa)= zvA(pi4+#22z;>~a4TFrv&EQc$h(b+mZ@Z(ppfMTkCf7g6YZWHG8k$QD&!o;ggUnIE zioHlOvjNhNNG`IRn~N(VgYbdz+Xs`P%)0X1V-qsBOyMq5>2ikY1#qzV_UK zKkBxPqDAVM+E}|x&+C{Q#?jeQc>p>y63xx07elWaCVPBpB;G5N*08~S78T^i1HoRn zgl;&`ZD)0m{wJfx+8K{GgF-MvJa!hGP6*J#I<0eBFhn%wc|eWiX0E`O$m$&u)=M5_ zLGpCPN){nNK~>lxaa&>+Pkff2Qsb%1O!$a7@Ph&S+aW&8}rc+ z5@DP&6kyK!)!3a21g0@D8GmvM19s9=NIiR`qaeljC{0jsHyA3A9`A}VVoOV?7hoZo zOI}blWL57Af{D5H&qYtlfXD618-Y&zyCb?Y5`w+S_1O9c5pZpR)(#@zDIR{nflcwv z%j{4KNh)Y7D;_X2Bd5{xw-|5%cnG6X8F#)gux`KuNQwJv#63~ae`aHr?ie+yU%wJx z5fE6gmp8UMCq^*akDfn+NAUs^^H#J2cE*cJ?|(6(!AA`;7Vn&@+>8g_V|mjWpN&HI zB;+qh=fu+)e2>g3;0tKPQ!JnknV8u$hz7ll)VwwI9+1ImSjSsa_qjRLibs7iiUX3k z++pcHk|f=yv2-M)nY-`N9+b?)TvGn%81PXHX-WeQsFx-?vu{y*UK59usAN7`wy3sm zjM20tVeM?Jg%bn@4_{eaHOHPe#-ErQ$!`F`$%s`B{dN{Xv>1(pX?y?tvLQ3#OF5F1 zY8$n&AIIyTnSZT~IZ_}R5ei+{O7~9@5qP>F^zIZcL_xf?w@%P(bpL_^!W^TW@pDmM zn@uutJD|pA(lf%x7cB8eh~PPM7dH$I>2>f_x{s9)+D+J#4QcfpT&)ivqNY|@`7{c` zHDVud==Ed`<3Nmgjcz(8U`IRHmJFmAUA)F`&B=Xd0Q(8>${`?XlmX@xa0<&V=Gc-j 
znk}<{?i{nwIt=(Fk#5dI9SD8qR;rq4Cvz1lQMhZ%dgj=exoOIP#k50^8Z!htFo7t5 zW<2tE3!;3!voUrR#LU1aTBs=FZk7x^t$5**7%TyW}4j@NGmTYVbbQ+;#hqm+%z_q_I9)5VOmGvE-TCaA$Z!4p#2{B&SNUD|_ zOhYK*vKlwAH{Mo@Ze<{Ocf1arVN;^*qe6f>La7oLAL-bjk!{PqA7FglG5Y+$xG_Yq3+&Sv#=0F&*7mv8+k`S zvjl^NgzQBPr&#cmY-#iz9Zrlh10Imw0t zb+R@26fub~kJRg$#WONZF z!q1$wykiaHH=mk9&6*72snjwtZY90Il@&O9U&M}`k4i`&?vaXhyyJuE^WAtj-Nl==p8mW z$Xg91y%J`}8P>jcL#1J6Go<}M7a zoelll25&yt#+KMGsB1Heg9?j1W}_lx!C!yCj^(KFwZ+>@qa#c}C_$9FCRo^+PtAF# z%G$*T5Ks_a_C^*DNI0}I8q9j@B{NiuVJIFINQbwl0rU?2II%1~AChsSvc9k?QI9`F z+ti+cAzNxcGc)YlVJOotX>R5JfDsF8E-#NKnYj%K9(b@wu5qBJ7|AFSjbA=Rfajz< zGXl>!r?G?}T$0!Kb|Wb3VV|xT-xw9_%5t@4QhrWQ&Lu0nCm^p8gxo6oV;h{?Q-qb< zLOm?3J-k=r4u0WNll)MMf)^}E?Wd!M2!A^SX=SaM&oFOS)G5kQVK2P@84@;wE1g1$ zmxg@q^$3OPEbfacL(LFSFNR!0v6z5B&xRz57;6v>H5E1CgS9L4%5#d{ZKY>4A)MO2 zY>fxft6*dHU5pso$vJZ!4;L8GA960fY_jtSh#0LxWadFr=&u7D|KZP2zG1R=lum zb7ARh>2F{1W{;$Mgc&^>0mLI*;qhly#G8{s%ebA(5fHGGK9E+@tGMB7`eI5rH|MZ83b$Sj z)PlPBS4Z^=u%xAVIYpLvR&gYN&oBixmRe;*SSrAUyVtly3o~zZ@bOxfL)5(Zj~h?8^aP zE~IPep>)@%AjueRkG-d?5?4`!Ji{dKiF=li6K350<`qJJBGwBUXGS||l$r((Ad<+~ zq!U9<8-kH7PA5Mml63|lWkg>~igs%Y!-Rnvuo=7}A3U=hi?+lE4U2u(Q^+kk$*A;Q$HJ!+m7Tm%?&_j67gWLrd=a0f2*~8IKc0 zjxe&b$qfkR!U5*NogpKnwzb`R7TcNYE1{}va1je~!3Jh;KAHp;(tEHX_L@DMQxGoD z(`qt;h%(CF#8^f_Ng%C*2LlDpZFa`mh$N>rilXGr$5i0C0nn+^eFu+mfI}F~E7LwK zJbO*Tdt#$hOEZb3X=|T>)p(o=Sq~cM1{P6Ns4Bm03LR4V_yX!0>egbRWWGU@uMmZX z@WCbj9(E%l=s_8%rf_;~B$FL6<%tc7$Vb*c0w8x(>Dio=wlJ_|nh2MZES*CcKS=lR zinSFty@=Wypia7w9^>3j?PTSaoazoC&B^Ey8tmiTP>3DEA6Vx8c#4?+#I^LuQof4& z)TR_s=R5aaQZEtZ#vlmJ(i!69E!Dl|MuB?Ga-2#ccRR412Mb-NZE#0JL1E5(#)4R^#s{;H9Bx$gQ z0Uj-}drKOh*D%g!y5+xSK@ktO+3*} z5I!WJyS3FJGd$#C*z|v6lUCMpP+7>8+m07D2V?yF$@D&wekR?8xZN2SV1{`KH#l%3 zm30CK?I0r++F3K}id5|21)PknN|{D1a_||i4OFTp6*y%28iep1P%frvM7Lrj082r% zz7bv6!07yIQNVy;c39$PJj=n;pTIPC%n)pCFLI!f8cTLPIZUt_+Ikr&t%C!2Vo_${ zG{n%sov}Q3R@vJC!w08*u>Td+_10P@Dj0b|+AMB6PeQg@d)&?3u!#8I(hN)g@->yA zyU6y!S}qEhyrtDg2l&Mb%y^IGm=krEh)~Y7fbKc17zN?RmQ+lvXb{{RtzeA7Fep 
zOa&+m6dr#vn_LtLc54Q@IezleuMSJvp+I{aN*IG&x{Il9LV*)d^&-bHJzy~|QJ0!& z3MIaw$BzQEO=YqL9(-o`zpNvvoDAP&LD0gI-zXo&h8H6sBl5=-N(KjQLCEhsy^?ZY z+TkV}c3UiX+m<>`XiS5IPVZi_Z$GnRHEI)M(ULt;fz8atr2kxkQ|+CE4Ghq! z^iX=y=9yhuNucEs4%XK*QAFLdVYy*0IgFZ$#XlUYQCtwU+tw-$YcP+_YT~{8cPLqo zG1w6_$>v!2GLH>ty5?i)1L+JioEsL^(gjsoJkY;FO$@dqFN?stW;-kR;8)JW$go%? z)<@Xg4dv7L_h%Fkr}ej@Rd8dua}*W>+RRyzJ`!mxwFvPn?av$9`-584+k?OkmJn`A z-0cj>&v8c6>3_s9;h9V04>m~Q9!YPHjd6+n2)7oLX1Dk=8yL{}=I;m;f>pmHp6IMa zU^fbYdI%_@ra4=`A`Uws;SryABQwyU#?G<|TN^&TCNk*b$d2BcryK_*JMv_K2aKp^ zy8IT%*20SSxhdAcK)TXG++KmUG~k_(%`C9KCBJ@#85m#O8fYlE6iI=vsYW)m5pO16 zp@Aw?vnXqj&>)~iiF)8hLdzT8bA&wQ7R~N~xvDTgQ5TN2X-&#!W=0!pBONP%OLAqi z2ZH)KjB2RoZ+e#d56b#w!CoN<3jX4yYNW=Uj$kS$Tyl-cdAIU>J}#Z+U~!VS$V z2#(ss+&ppr$+7Y~OK^ngI*^bTP(?4DiCP$JG&4F}ZN=tG(3LX-Xa@QlxOV}OE_#5g zp{X8w-=KZT%BQ(S<1NzNAyiwBUUL)a1KmKm`S65NlF!shCR8SSu& zeg3kRJ{lg_l^>al$TZg;LA2D|BYD4O?)rk)T>uB|ZKmji0j;fe9m!kJ01G4}vV`1= z*n0q%)zB(zSkDHIO4Pw_zLDC70-p~GR0L56tXILT)rtIl4fXDXaWECXE?dGa8{R8m zau**;A4BnL?8xH?pBl4U5pLex;R@j%g*IL_N{P?2$pVI~f_1P3tBeJb8yqkn1+hWN zil&@}nU`pgn_=Duk5JnoR~f1^voNH#A*id#J_g^#YQ= zI0FAC(gS7>1O+X358T1!$n&=(xHpZn*l*?X&I<^ii{*KRd(;P!DBhwK<<2iY#-2{uzoc^w{tE|2mAH0 z^r}s-i!yQGUptFr`^`H*eAo_~5LyF&VlY8I3E_$utVX=HnDGVHsbd#gD?@Z)f6^WK(0Y(;R0*X9F) z2oo`toJg{x1?weZ#LNP>+!$%~Xz}AG0|uUhTs5Q|o*Yb|x3LOuOFxuemtH2>OafQK z1~@kqLcac2(mT?7(ibr)dz_|cJtEq96w_!!dA>^pyRdk5hf6A~k}|RWq2gB*YfsGX z@9l7q8S66k!QwszKVK6SoLPk;3V6K;?Kv(nfZUZaYJyuC=5ZRrx|H6Oe!{WNtwCVX zM^fS#*Iq797ynTD2kD;lvaMp^*THihTi>*KZ%RMpJx>w7$en8-tSz9}L=u*PbMLK^ zH3yDeNN0Ru4bkG@EW}_=uIR(quzw5I_=FXy-EuiGE609DFW}s`kd39^!y<6X0DdI> zH|bRRb>{v@`Q?Jj+>+h9j+WH8_`gc6^f~Eus9zjmkta#K)Ia2AXI;hB5b6eNPi*Sm3{-vE|G_p=7Tcq>619rE9GTx zOMfrDES*y#y90#!3=ZoOe)>TB*V2!r|C>>ety$uP2<3e3AbkwMXKa9ST;GxlA{#!y zY7V3)a|b5A9e%6D*USm!7f9QQ^bjm-T*FHXerMP~yCfb6p7HD$f{Io6@6tm0Jm*92 zhe>FdF3)CfHt!ATFQgOcH>8)4{{k((qNgNB$G5)zKal>5)Jk8M9^hr}k&Ok7Q_$ol z`WP}2!~?n4ITk6(Lt7BZ-Kdg~w&VnheTMvIfKQjOYU8*1^k>KHkmWGh*_(D0kpDK; 
zUt$g*S;>#3zXmsbReD)^*+}3U^n5SmptbZ^`nDua0KN+&!8}`v=n7(ub2g;?mj(4>tUFZ=rn9Z^MuG0_dqy=z4I6J6zRa z1i)ut>UW7<3!Cg1k^Tm!w>~zVu?kP6?{Id_7jZom8B(o{SyAI)JR{G zzG5PIjxwj`|48~L>7CKVkEQQ`Q2nZOk7|BqR~FJU>F=cP0agtZxMh)dmMI)q>fA;^ zdC=aofR^~DE%vOoI_ex0D;i@57tg41o=`yy*$yf|XM+K=V+UMIA4z}1?1@j?IB+lD zCTsJq^qzDTOIS-EOMgeO_Blkjv$sfg{{!hw={+huz4H76HLy=}KzPc-D&Vw@^nvuP z$@Nd4)cqB}#^eAwhkE9P=d0hTAWGK#fiv5tE(E|AmVwjbg+WF9#)O=yfVh?Dzk=o7}6;zN% zbQZ&H?J(GZ+3eq8C;d$NE9p{tO?pjwMY@A+ifYV-^d0H@qh;7jA4=cF47`ok3ED@( zwjWD>C%xg8Fz)=fQY*b7y)50OATwu6KbGE={!x0{PjiV+aCuJI)u=~^IA)LExf~6p z4v2eYW#&Ej#m>;bi6T^Pamm3PY(zaR!Qu*%)HDLLlUnSh$I=JVXAsl~pVCG=lm3Ry z8%IDhAg}(a{l-W#6oLV8pBvD=Wi_%EbK(&sq^D@cn1r2VP%UFnCu{z-Y>m;O?! zrO(rawKM)k@;mQHe=q%$n?B!yoaitrX=gU)ti_Y;u#nm4;_!?uMlVBJbJ*;=<8$dF zM2ZfG=!!3xRP!-|R<^eVK=FnNKlwm93_$dJD892T!jd zQ6#KlAj0Y5_oV+Jt)+LR7YP*Z!$NEcfc{bX(PS7GQd|(-XHb9+HbWp;r)EUt2?|kR z8WLss8uHLJK6vF+{_3c*6=leVWK}?zYhGD#=ZQVIzyqHkKs&?Gb{qnI7+t)@wf@9l z*a97{$bue8ACCs=C5d}Wx{&^!c8HF9mOR>}^hA0#sl&lsDF{E;1RJ6B89(&Q#W*~0 z(7fl^is(GsT3I%L#ew#uH4XRQTKQ_tvz!|%Qj@Q6E&W(}BK4{|V3I{m2cq6?hT}p4!htSwzo_q4jkEF-P>+dy`70`8#6qVS}%;-d8 zGHGEWP$Jv!5G}901v&gCg%4?NaC&<@acEL!DBeT?$))g3>6!Fh8VL_5S%AD-Y6BA@ z{I>K&`X@LBIjS#mnrG5u%2;E~dsDiQ{?_`RWJzRgHL%0+!xvU#?65wiDMbK@#V84i zB&x7?IZ!JLMoI9YccbIPBF%}~`waQHq!@S4#wcD*X7IiWm(sVO)2#Vv*(A7YtCkJy zPs$6vZETmVtw>2!`S@6Q!t(uqT2I3+M%=QIo=A^)sfkZbSR7e*X~r|EC!?0XoA1r= z&n2p!u%LrYL*4R5eFj|cce$Z`Gn4#{q3{|_lPyOt5ImEQjhZpivj`<76leob9A&cX z=wHS>a95zM^uaMnV|eAQ5X#O9j6&QG+kyd{p$hFTt$G%}cp_1_V%#rpfrpLb_j z`RPEnRb^Gn;0Tqtq3cmgXl}_`#OoPX7kU1||AH6Cl~F#-?KFhk!p|k8h5?FDn2A{ey^Wf+ z11DV#p}w1Vsk32NBHqYpPRQZ-WKKf9?eohop`K$R{$(#tuX=&$FYHvhE$GfQ6zhQk zbf%>(Qo2K852PZwF-rQa>8yid!^&?h{R#e5YyDc;^WWuv)B69jl&1!&%uxMQ^jg_W zjD}kK{x|~*r<20~$gFfz;@b;wy~W6lRyKq20MPx|AYhHgPyCClg8p|ZPm<#3r|u{` z&xQo-`Dhw(1du`ZjJiuO8ar#r-b@|~yvP0>K3 zGNJ~YZAO&XyYdcoxuSM`cSu+3xHvgZ?{t%hy;dNuwGA%{0QF3|2R8l7OW6NoKbgI4 zO9jxOnubIx!zB*tSS>8OXatP3`HE;}W|ZQEchc^GByyCd#UC%t+(aYK;osyIL>+!- 
zNltFRJBA!d)kyBv!wH;Bsgooa*e@Auu5)8S8yd|5cm03OM+F|6qk#m<69xs^9m#Usy zGPq`v2KsN{6TtX0=nomNTZZKc2ycMD1uL#2xZJUwYeq7p1-pWiFfd*+u|JTeIRQ0z zvwNc2J^=bR;s9hL2ZSahdS*-C>;b7P$+QNYkeTg7gAU}F_W}>GBv$IzFnHwwE*2C3 zf{B&mRW5F+aQv(P)WkbyM2*iR*-Xix#O*OSgHm7)JlB#ePT=0x@)&5PXKrBHLuaB9CcE&pA24{9wZMmuA=GN_bfXN)?_nl2e&p5y7`40Q-^W+p_J`f`uYAs({j%xnJ8RKxF(Y10+29pAi*Nc* zPAO|fm~#P*_1q2%idxa&DS{r+4egFB2rZ4p@Cgt^X8-$(t%6^6+x~s>`akFT=Utrs z?u78aqWvQp(ies$)T1OIg8E##D+#M~z!{%WM;wsYGZ5>R>cfo5Kj9@>L}PC4V9$Gz z=dAzFwTjt4^WtA-^OAi?6qHk@$W)N%!`xJMC|4d=rj<{y%@S#kk%}{XKs1>TI#CPI zh7O%8hX?#+R{LB|&NKYPB{+WVUvmGtpI`ou|ML?r&ZQGh49&^9VvtV?dP>%Hz@sT> zT#A7XG5xNl3vFq$dIybQLc)V|-Wbpy9%sRwnor08qDD7aENks<6N`}%ibv;z+mp_a ztXnS*SJ>H4M2HUdiZHZiLu%nS<3g6^yW{NXSP;ze%S5zW+$Mp_x4@@jI;TB6$) z(E5~U>dd%#t(P92BSw{>d`{$vDrs{Yk{uTx7u$lu&^%|HY&Gok@9cZobJkyER3Ud2 zr*`CQO*ARSPG^2`6u2sSv4U;YP?EXk(3vre+>zj&+S=AIvN8j$v!BNRl*0m$??TA` zv<3+Z&84nlXs22zK!lk&r?I*%`d0T^~E^cqT*fme~ zFWvKZUg*|}exHnRB^W3NaGk+GQh&Ub?Fjx)X0~+3?)1tN)wdj*wX@dwjLf(p`W+7e z85^^d!oOfgbIHEm(8vEs`tZ0ND0#paXlcyMq=xofOFyGsV0`gPdP#c4ng~XI>{|Lj z`r!D^GB>xfA*4bc3yO$4y0Lqy>cF7GtTq(8?%CL#pg0GqqEDf>b{3Y03416MCE?f7 zD`=B4cqYBZQ0OoN9qgDim+3-!Z*(S4y7*jrMS6IAY&y{Gnt>YE#}}VSuShR5FGHA) zhF3@=-S1KmH8K04k!fB18frN%&shGX6uer`bFnMwDMoSKte<~WncX*((b^U=-)!necF z-8UFIm;sK0On(x2EH#3j8lx+WR^mW@IB9Wl7}L7jHMZX2pNBI1<5eOY@OTDcD3aMo!>dA*9V(D9AgA` z6b4Rhmi^8C4J=V6ol0MpzA1fCdeySPBra)U-Bam%(qBn$(YN0V@E4`ulD@<+xFm_! zEJGtbk^WBlGwDsgIEmHgq(6|p!Bnh>RKsPbR`|b?-Whd)g!L(SixTdw4@BHcPo<9t zbb2tpwI-<;=Nn}I!os(F#6y?G1wvM3BZLk#MFj_JF%_TRyv0iTg7l}-S81$W*l3qW z(ql=|Po)1U{SfA3zy4p6{#g1lefZ%6YW((N=?&?>OFtg1e;PmW$I{oO`*xDvQyXje zk@P>Lzmneei~SJ_q{O1za5s2IXLX9LMN}fxzEdiV6Ln__-W~Pr7ue-Hpr*I zfL}<@q;E@qJ~Cb*g?oCHSCpYr87$fJXXt-Jpx*a#<#cCU4ww4S5oGv<*u15m}W*i@TYA!lD zqQv6wk0&6E*8fEMob>zBA4^}7ULt(TAgvyV!_Tary_f&bNWU%piS+BzeWRzIQKU!) 
zY$m-Sy%(F8*8g+Te~|u^lLRZ^gmX#Kh4fQtZCtVepE&ZMXqgD)Byfr7Nb>H^oBxY` znaV7AX+IPs>(a}Yt~`Yu*;Ik$MwiBc8+5l-NYW~NM*2hP-%FojuJ)yLVn>^8q*w5* zPe$wiIqBa^|A+J{=HXnbrAPFsr`ptu(i7>!I36_J`8Dacq~DW1E1d&k)Ogi`Q;yH1 z$8Nkv!k?HT=Tad(kshOhE$wtmpnyRJRublS*!KcD-Hu{TM`7~dQk>Z=vO8c)Gju8L z6V?3d(KOxO0aVf}((g(CLHeroMOD0B`lTdyFG`=0{!x0ImZ6eflzv|0sRmVv>OlToP~3FkQ8EsJ2vT z*372sFK)SE#bz|{Nh_kB%udN2G^5V#G>m8km;;1ut*de8b38{f@KE}7>35{hGu14G zVk@(o=}8|*uSloTll0&->2>LMrElW8D^i(_6;4m2S~{0rkWOO#(XyZuNwFqniJK`J4`D3ek|ck zdR6*e=?|nYODi&%Qv**MqM}!&&q#NqkJHv9%lC)Ue_{&O9X!r~&0CSaZly0s-wEZ3 z#gAE7Ep+iUfmzC39)eL8qeBrh{^$&xwY39qBi~vAi(HSqM~D1I;+Grv-VhjHSc||> z5je#-UzYw@`ZdmF+Tpdrh867Ld~~GZTzXCVHR*G9sBzR(S`1_K=Js7ck@o!u>1(it z!x($bT++Gpvh>2R!KYt_26J7~Rviq%o%B>{q;u&G{a*bi%4c(^>=~q~nFanW=<5K$ zj_S56N3;Sgt*zj~diDmNWN2QOz9xN!*nP%N?@`PXekBB}U&60Qzb}0SDXFZ(aKl2Z zp@nC$dFkRWNWUR{0UmdU51jKZ9e(1A(hpf69lu3p*=oFGL_izqW30mUXq-%Iai}a< z4ZxsfnO6)fo>SY%=+ z-;_R$-JFrNT~ZUxINu|YF_(Vvm!;n%w7rl%wkTo4T3!UXtL|lsPw~yaBL=lgf|RKc!#)PWmjPP`C6k zM3+sC0re>@H%s^x>6_B)m|Gd4A=#1*(d)H=13hx|r_ygr3n;RIBCc$bKr& zTwCcWqwSX#y>-Sv%)E{r8CYd;>&%J)d$N>*@F=Q+H^To&dL%t0yBP66YU5F?N9P%o zz6_t0zJRkYhNH6*`6bnt8bh^v?Rimp5zElg(kA?PfGRbwR0jZ-?)({}j5k=%v`brp zxR?Vl9Yo}IAWi6C0}XS7O1F%ZJB)U3fe$Gyg^nIPpQSQb!{~51n4!9so=TPUkZR%v z;`>AaAeZh)UzUDd`V=M4+JfVSwK?UZ;a#cr_<89I;8!J)LIV~xqoXPx5;Sh^{1xe| z0CrJzog=paCRRlA)31H1m?NqZyXy5ZV*N@m5rZ z&Ww6GH?5u>PcU#d_-T-b!Vbb(5Hd$k{+NXooA+zd%fsPE8%~8>+F-(!wf5YQUcVr{ zF6Gv?wL~s!?7;&4n;;Q{-9Mm?TO(}c!$~#8=sXEwqOYiti}x%6LN@&2%mCV6y*(oc zx@W`tS8S(l8^n{=|FhDkF?tK)iBlVO7CC)Ts|FW;mb^YVdnK__65izG1{iv}Lt8DI0oxYX(r^vI-;j2J(?Zr%3V z4q2P}61&%6(qhip^fZeMp_1}zbP=E<7fQnLo=G`ZBN||#pc}y zq02aMFQZ(wVF^VMeWH`L1a?>x1D)cAqpSK3%YNcXc;K|Byqk~Ex?^Rm7_l-|;S7*x zZWA~T(#JG|-(ke&ha=M$^1q--?WvRzG&zP{A=j03M|ucIezVSZU>5{i$#dr!QCV$K zNpJZ|=6s(buo14tWQ_*9FlOOsh)t&VydZ<^q$lLmd&`XL5fUGdG$%e-6HEm+HKNi% zB`ey#4y3trTzWI)wsym`$Q*!%Kifhos9C@{6o$PC=0JNMGI~C|bI3|*^yS)WQa3sf z?DF8P<@kVL-?i{B9#3-M;^8+O&KuBii%Mr^x?>p?k5FT7oU1lBMoVr$)?f`YeDoa0 
zy|>wur-WSBHil~=f(XE-#>$o0^OU*0Cw0;T&i&0i1Iv-}8OAL`wr50R7qIcWV;=MY z)3LBKHLelB^e&I3XVOzvYWn%(S26+x&%%XQnfdALv3Wu4X@Rf~)azUBF$YoWUp}R8 z_u+ih7XB4j4{^6phVd&zjK9aFFR%@P8RbNVJ>p@`y?PE01VF@@#GSX(iF@8W}Wv4k_bH_ zv+rybaw`msKPAXdVSJBkg1$R8EV{5EeK!wNmEV(QOp2coA zJa%V`)jzXh4Ff8Xo#GR&I3sf}Cz0$DjW5g?9}va@rM+en)t~}ztwEw&ilez#Fn0zC#2zki7ipdDNOK-x ztiT>o`X{Hcwyd#(ym5wJT#ka^o%D?2RA*fi1s8W56Xt6<_|yck zdsH~t$F&Wl{1{8NXE_Enm=L7W{P~jM9tCIoJsst-(H!heQU?IX8HB+F;Oq6I0qv*+ z+_w`aTV$!?{-;^qH~3A|c2}FVNZu5}6LiKT8bbc61)y=o8%lp|_RA ziy467%b3xL>q8hjb99;6`LXNkOaRkTElBvT$-fVnzCj|}i9D6axGY$iV+CjkQ%@}3yfQZuMcfJ+UB8Mg z|J1Dc%rKi7HNn962KZ5J-Fg`@K%k@@4zz;?TTccM1%TrU1D9}EE>U9P^Ik*0b;yJ>lc)5nUO-|x7 zI}QMHAiO2;!@}OVK#My5eUJh54xnMm)0L4|P5|I*;IW0FYK1>TLC6qXu^Je>MWs8k z_(2L^n#P=9U&>)Odc`TVtv|gB(4MrrGI*pmkh>$NsE9@eI@pIrFkMg;tXXnNWK%&=SGv0CcAPSoCF2?~#aS^!hwWoHP_0_jdWx*Ej*4e?_1PwhaF z&ft@l^dwv!k5UyG@C($CO5^NBCGM`V0eTim>|8)7b5pXFA>kR+lm|fVgOytv1EeFE z%b-Udh)PaJgTUPi2)7jV&eA%e0|Lp8mB$@GgrS*{*#q1tj#q90V>x<{_UM0r$^m3` z6hJ3TP-~VWLXXNq(hM{Cbo3xCdoBYAcCg5VKe%uFoZMd_Hg=05Ig#sNRxBTRkJ1fb z3iI#UvCS2Ru|)=g5_cHAzyXcFLzvoNdS@2<90($M16mp+5kW`>@{mMg$=5V#fWn3} zmOK_F%yXs_my;1xR4{jZ^)deFoCi6WSKE%fM?;5CqOop_v$I2JGK)_pBIp`yDYL@U zmg_IM;Q^?+^^04ZdXRF-g$ZippZihyQvBrFK%&rr^j^=vSmwqnnxL_|8!r)$5=m|h z9&atj?l+<^MVV2$NomdlmbNnQnYr~1r`@r6o%8}D(Q-2XXU{p4HZ#wJH&q8BZ$OMRR^M8-C&D?`B|o(fYkHQrwCN@KXAiuzdWqGTdA_W7#s}pC!W39ljGm-{$z?T?>N?*kOXIq0TR*2c%-X z6=;uQx(#c&#r_s(-o~e~#tMZNM%>z1XH<#P9(#|{e`7}LVMJ5Gii%QfXisc)?>%yI z&bdY73x)_r)Vu+kyP;4N(y8>&Mp)$3Wa3SN&T?v*-t-=blp_Z`Xt`i7J@fMEXbtz$ zQ|TU0-f)CfVHx3!QpF(o@9^|Xn;El&^CEMjd&^@6_b-ULpAxmm1{^O`j1tJ9-B)Y)cT5C%}*NDeyimcPQ!p4)i~R(Xtr9xVg1YWE4xc z(npY=9$2A%$Ng{8hnkooxSnTZ9R+`w656}cfy`(k1_UrSGctvWO82Tnei=Aj4^-5Rdj3?(X?L{R3q$V)p?^{yRM-ja%JeWFr? 
z|0N`VibdYB3p3mQY&rvoH7GFA4dLsCx^iK8T{n6LS++Ol7IR)StkR55lW`4T28~Nd z*@jJ964Ra;LOb!Pi7vK88YTRj;3X9V0o}+FwpRJ?FkLMgyQA+i)p4I$jr~ST-1i1< z*`a!S)F3KwhvNun0e$_7sIR4bQ`pjM*tVr7(y<>wUMZkGgfTi0R*q?qk+8SsMnt4> zf(>eo=Q`nlTLPruf~>IBQJk#^oR_YHnMF#CQAk?b)YiBVPo2jdH;)2RlEUaBTd@%* z(krOMMD&oG>F;cX(kd=^g-BEeMfeWDB{IbZT{-|EMeQ&1QGFR`)tOXCSJG2E!!ODZ z6%2qow@tDOt=zNWfmyYLK|!&<8yrg63#ZjSr&t{^X;d{1BkTJwSmA>a^lHpO=bE!eo?`uy3AtN;oxN`#DbTsj1hI0YX-aLuv&1D!HH zl%n4HvGlq@|DmN(dbxun1+nhtX_vq@CHAO>yy4DlWpUKl1~u$r@XjKF6$`urOBh!Y zya?X1Jvq$!GtAY2y*P|+p2{s3z$*=;+X87@v*z`vdNLBkD`szmKi?xTb2C7j$t`Sc zMB(0YA~DCKCBz#<(1Bb8=`?0n2zxXmf?iscdgA66%%TbY(SfOCCFtzJW-;|5gKB9agL~ z+E;YOJR2Qo6OR`pnV9`ikvQ*JoyvRgN1x1s&VYj=v)J>b+Yt`XnN(JW`6UkRCC@(x z(0DegjzMX4#(#i1S%9whb<~mQ_Pt0pYjww z!jFKyGxW2xw8Hy8_QwXOLrb^P9V;cxu;;aPWO+K*KmUgIwmA=#S;;ba<$E0KvGSd- z`0p*DNP>HlGCjnk9)?b8Y(0arpBbevAgBb>d?4+l7cdxuX5LGJ$SdhCRE?D-RFR*Q zBLf)NOz=M<>uWK{NII8Tk3=nbNGelIuD>keXd!Ea%tSyMc5?4wSZ1PjX2aytpIpP`TOr_wQ4ALa z8GnQTkLSqJ!CIT9S$w%|Y)0W-_Adb~-S-Se2GQ5Xc6t>=bHh8v*LoIKHgxn0Ya`#I zxf^RI?ItrAS*%NW?)(Guj=OdJsEfrv3`_xbTS`V4}_nK4E)h$q3HTNx2> zAU3@j%UGNY8OjI?1D?C0$!%Q9%&p_`0Pm|ZX6+TybFljj_PR5kO^F}eGLa~~_i|K0 zi=UV=?=RZ;E`h|(7^|7_@0VW3&FohS?sLIZ^rX-CP~bhfG0_DiN;VO7=djvJ!_f0d z0~$dd?N3$r_A&F{F9jJnBNaFFl|KHTWMnRZo%1?QY8; zQ?m6ydPzC~|47~=MubE$BiamaE)L5gtn|`w=Rl{TjMgKuk-U4MAd=}BtIllUc^)Dwgn26zm zWn2u|B#n8mSooX&%UE+!y;|{HH3>o3_K45Bc-tr=l{U=(fGs(*V%Tm%A>5u6IqW2F%slF|J_+ zFi=yhoSBdpruqj9f^S}aOSrK^bC$MkGk&fxPdt&nY&jq(_%k8bt(_h)Do+hQ2aH>3 zIqk~2p!No&^f5rkti>}MieB4lH5j8nPA9_u)?RpKwbR(b#;^>px%J*}VI5H^@w!6! 
zmk`-z(#2^1qrsrDWT7?{UKob7O(`H z?O7Z#wj#%0H%3IMFf0k`y*w_WcBWrzTt|3<;L^;9fSyi{BgqVB2@dIDnEVB)rjOtf z@9;|clyrivjJAl%3XjdO)~M0_=Q!)Y%)$UwM$_3_hhrA1yEn@G7VRs!e`ng)j7AaE zG^cKjWU@!ac4o|Wqw?+O6BMpf3*VO5wz>2&PxqF$v*Z??HE7?K%k~7sIg)lh>P5>j z8c9w9PK<=^1^%-&4(5ac?>UpBu-6vS8jhv&m6uiUf&x|(G>L1%P6c6iaOddlTSI+4 zwMhqsJ>XM{h~tOfF{$Hv*t`@WS%X(Tm+rBIeG)71lgCodHv-5F2<(JfWgos4EdCkb zymwQi|2&Zv(uMT1Vb|GU2{u+Gh@Wn819SGjwpNgv=MRF#g3C%v-xju()#O0WC{53J z)tQaIxgKVH*`)kuR5`9daa)_|5uT)jq&@D0&ymm#`+rIdlH@v^Sw331t@N?5X&!g-J5Qu1v_HIzCp!?(&hezPkqrswniHmPQHRcm3pHGht^0sTpBt2{ zL(Zb9?O@;89UD*)2R@=5v9eWIvT_l6O-utRcG^eq}yv(2QIYH*P_JU*sE|S+~L%eZBld&??^96*kF5RCv!u z_H)n&FsF!m&Gh_*)pWx@T}qFwHRSl*e)J82m7iz4*%xh6j9@@QGf{c)AsBQ5}(K- zBi=}h7vz`Da@^d;g!DGNv9^v zsCERwwT90gSkZzAF8T;|)}Jt_;4P599b+*Q1>~7Arz*mzX&S4G@Pb%jVK$_)K7#02 z8I;mD-0HrKE!r52UhqR>{VjR07XNt4R~vS8gZEocicrbPUkiJxJ^6kVmjj@^^k0b@ z@3oa7qoy3-K-5%w{5R+?_pG!Wh05U3$N!GgPhSPuq?e!qTr+iM&uezl1zffs1ld^K z^ez=k=Vo6b)a^)w9UF`Q++#42*1EC{9e^wgjF*-HNJ&DjIRhSH8JF5kpmzpAfOG}6K-{90}W1%>)V9_ z&rQ}Et7E2;#(fgups$!;A5k)t|0wm0_A1DC|(e~cL&+u-0#WSFtFg+HwwWztswTTDT4 zfm)>haO7(znlM6*PN3N@4gYdhaMTKS$3Q?3VZ>L1^jjHoOUI*RD=QZ7t(jP4syi&i z?zRjj{mLCXePWN73fftE=HQqNh;TKD*Ika+oS~6hWF#&wkAaBj6xpG_6)`|s{A&)= ze2o}whvcOlzGt(zOM~NsgS{J- z@dgl3@S_Sur?sB`V4GLw#&L0z+T1HW~TuN_zPJ=k7nhExC>>Uv#F~_PBEJ zmH@qxNVX(OqTKDjoOHN zV30u7&YhXDVuhJ8BNhvh3Sj3(gc>nGOOEnx`6tz*j!^T+hRy~+O#0H(MjTiZ^h~&6 zZt>u7@x?w1B@v*;hK2lC`cOLG7ePz-zP6M(2#iS`r?zv3Dp=bjwk`W7=#t4jOn^Xh z*Z&rrEokf&t5};SiKeo^??!5nlXZyF004jhNkle}++=6KZuA2NQly z7vs5E&YYR7fixET4tP%8pkP?Y5n4Dmi{4dVWAk^95fE>RhIZ&(V}Cfq8}@WC_CQ>- z!%EMQVQy_gm0PcP`UybY)_VKWpB%H**M1tvK!H*UbenC|635P1%o1Qv9tQ*yuQsmM zmZxj6XA#`BgWX>dZ*2iETN{A3Ha=dBs*VJq!9uwKA=zzhZYgk>m!1!*Ei}n-BbNq) zPwimA4#v#L9}~Ingi!35&2Yhj9}rUw!#|gT42wzsIkU>Bu&OtFo|vr7V9vfx-%j4+ z7G=q~fzcg>YXiw*#ij}6iF>Yy=6kWzK>lHIB4vpm+Va0q-|w_AsaAVetA4a}wq# z3(K+MtpackA2PvgOpK{C5(5QuuC_Mh3{#n-jtzdc*T>LU7hI@kOVKoFX33de?jNU@ zb1wYMK#*YKMnajic;f+3&jD<$Kr>~#t=XXw(_NZ*>@MEf>1Qog{*s#F(h|gapaZ!T 
zr7BD0HvDc4vwVgzA7K%sMyR{e%MfCBR5BL4@i9?WLo{^tZV#n5r4OZBBsgJf3(UC-5gB{~Lq? zA^F@itP8e+?G#fh78FivG`Sf7s4j(b>6g;S(lc~SpYxv1;oTZQ(+(Ez6X{i}<7VK8 zhuFgn_)ixBrBwZ{^h+R+S)akap(kWRa(FZvLAM5G7j_PEqy;VGq?Z<7ckds)o(cIo zX2(t7=SM;`lwU)O=A2!%K?Nq}NrIa?{_rd5SJK8{a8r$!L#5t$n4OnOgx zM>=HWWNt%wqN=jNCCfvPGgkfF3ln_K_B!r#` zC-~dRuzA-;mR??6pla zpLzP*a;)!1(tQeWVaXGzD{_||uDkI&-vg)GGKFkLRZ@lnG&Xjj4*|)dwzZj&h6rLs zW8)g8Kr;|PGC-HyVn;L}*z`N1?$uzJePLQY2SQl`CTB!iORF9XgSaGyH>K}Lhs=ST zT6fvbD*YP(jw&ubm3}7ulXNV-VikkRa>js=&YX$YN@vo~r0+Ib)APT{m=b4O8GG z!k@-{q^%uvAmq zbP0vI#qVlg{)H8u&nUa+ZK~>zOO$?gl|hfT+&@cCNrm)z={5!=Gwwvix$ax< zWKMT}L;8EEmR^QEx-u+jgA|XjWJ4G}2kknvlJ%TjyoJ5Gur1W<=qv1`<&qfm60z4b zS~PnYRkyJoQ$b$CsEd%lvMZykK6C)*(p%C`q%Qz71mUVB>%U;96$AMv{r?Bt5lXqY@-?5VP*;&O4+@9#{x>R&keTj#yT1T^O$f=8|vi z^gea2{vSw2FJOH(cEhy|$axrZ01`p%zJ=4`z9s!9=~DU@{EZy()Ht(=bS`}$ zy*CPjR+e^_INfNWSy|#VL1#J)H#2zaA)Yh`{XR7dm(wrQL=k&JCvZi8SaFA>zy~bK z<~jg6iSPsIA2_+{B`d^ZOCR~ctV#a=1)04@n!3SyuZ_)6j`p%;9P2yi0N4_1Mt9@}j$l2w zURb4&-vmVS!suWVs72e*|DfR(@Q0?BZ%hoA%Z=jFj>`X^q*ovkW>(F;B*IR6;XaUS zr~e;H-;-`I{Je&_bA(GQh&Jy^_x<9v^p5nOrG@mT(vqO@gb*=y`xz&)jX^-3LPZ@o zxM^#Mje#evqLv@kr3Uox2F0e5ICPD=v_^aD9Kgm6uK#{xB_oa7Shf@&rBkblE(1gmM|MYWrrSC~^0}5PSd?x+E3S`e&DKEOd3+Yt) zsq_vLo_e34AqJ(fBIH;bsyQP9=wZ0w0OrJ4A*Tr3P5|P170jq07r3wJ zq17WWdSbl2CH<{5k-j9|G;)4g)Q32h??`XB8%z<@&!|$oX0)jmrIsT#>VDDT4e3|=4me7&Yx2cpQBFBxDrX>Q1x(tj zqeg?p1-qz%N)n9|hoHOLfdQWoUC-?dhDehq)(}%tXBlC^CHenU`oE<{dY-i8(7L)S zUh{|259v6$y7)qRSNg7W#5}^S<=&5_dr~WXB>h18Imvlv`F_ggIpY3zY`#lE9zT}8 zBfaJF@0sB(GOVG4OmIID*)Od#x)(%F$xx5kA3NI#kEJyvr|CWgBxet`*3+KZjEf9e zm7~@_)Zl^iLqM5?Jk9%{N+J=b2xvRF_`dXA>5=pj9y}x8smUwvOYccdV&MaftDr)Ui4}( zr4j#wYd7Vq0h{ecV;`47a-$d0_Xy}5F+BsqNEukKl%Kk=z9*eZFTjIttTLH^`4s-Y zI}l;I_(#%8`XQW`=ne|lD5ZbD7|j4`V=WYR=)%Hwc#d3_0~_4pWlk;AIplXUGJ`Y9 zBU#iCT2RF`hzn(O3$U;2)83J@1jSx(khOOK>? 
zrMF12I}tvTek@%||4gB3&PxV!@xJuF^xppBJ(RxBK!^!F4Rf%IHQ>PegUwU=0-Q>Z zm_)uqV`kPRy#=RS3=T$%B6l!WM(d+E~`H)~J zD(fRf;4O69%9mZB{ zq+l}(O>csvM;siwG3;U*-2;~d-kF&ZU+v}zPW*x}A+BugUz+PK4${_&9TQ<*+`Jjs z4c`DIAjc`Jy2wS7uRP#S);NP68Lt6ujE?03Tf4Ch82s*@0jii2Q`tQH=$XnW!q4{` zNcW`>HyIUvYdhgu`L8T%$ojwjB>C@*H@AOhzka*|AAHP;rq2l_krq&}AfWAOVOZ(V z))4+70z1S8^~&!V`ai+4H_ZKy#4Ta2xh2=xbxROZSPeFrN#WNLVLaV+B7D-t`QBx| ze%Bh%9ktiB4WL{|cZ^S&;f6aime~;p0sCeAi!kp2eugLO@Y4#y+sq1Wb80;^NSo)D zd+W(>abY6p(WVA8@_`5|X6@ysj(+iz^f!WbzfxwuMs{9T1RijoBjkvg25Tg_*hdjs z(vD}a9<@2!g!!D4rmeh!oZ;P<=DSJ=kn8?v+X)P(CoO=+L^QBLpK`ITcQH@8xV0^& z{u+Gx^^pbdScqW1=7bU=0(itJ6~j_`&`Iu4Z;Kw6=t#O4H~`OogEyID_o9v)sObF3 zK0s*c=r=w^_}5*$ug}+B{Ny{dPq;GLmV)Q8L+xwpCzqWs)2-YYtpNu%KRRNKnxCsc z;f7T59T{X`8-=yrw$wCl4^ET)P38aTOVF79{x*x(UT@+*W%CH&U<#EW{mFqTUm$Jk z{XLrTrJy~fU}#M$HM6R~kj2;%ac-EzH$&Txjow;bcPPAn+`eDu37;g0QUCDi!q=Go z$A6BRBSR4)?az=>OH3Kfv_a19p#bUsQlD7R-7+R)uCeMz)E8IMBk!V#0@2E(m|_SX_d`^5kJ^v`d6!T!a+!A}G*)sk$j(9$JA=p{a|u#q|4^*-w4kqv{I;z=rY z&j^Mb@J3S$|R>=G7A1(`1(^_~bb zo4=E>7J-GdWbRSE?_s!=%_53I*Ub77TYCA*U-KP*C;9*O*Z=mH6@>gt=!liYKnoZSrT5fzxyvXJg4L&Y<`J%Por#-vY?aPCa1z@SB|Pk@fI{_u|Ilp6Ly|9*U& zg;E>fwPx$5N=RXDpqGI~!tXS~CH$ictC9`Sk**774X_vCYi(ShwRM|Tzq@()-|Z7P zfmiKvK%E>xX7B_j12d3t&SRLQ!f$U4`Dpf&2vfDnil7g-ds<&VD@LCu<{M?yTn~ zvanB&K%e~ApT2t8Z+-D^e`Rzi1>G|vUTJJf$9j|pomd>HI)A%=Fy!10p@{y3ULShX zzuOjpZ>$bpurz1;qd0yeLHv&tzJBYK&ul19V^br-Za1`Ll!JM0GW36LanFK!XG&Mz#Vh(1(TMZM1uHTrSLBx|4;p)@t+7!IPXJ{6$&rx^umTN^zKtJ zO4l9uoZrnMRXrT^(v@%?ryw0Me=Fg7x1?h%+HaQJ|I2TIf9cKBD*|6y1tOYWTQG+j zkbY(L-R_rnR6tV@(*WT_6+5Q8_C5v%N=jhejtNk!oBVexoD0m>}a zXnviG_koCOi(qlFvs!&UT8pelK9T=V?KxW@punb=qs_b(hr%!59_JKyh3j4z(y(+c zL}7*hnx(3#gO)sbkmRZT*3y=Np@l= z(V9@aF?haHkjxl+wedV;1 z*Z;^D!tTr3CIdEYj8pL>B9IKD*so+|mRw)Ec!SN;@PINce$vYCUw`e&_lm#^ti)qH za8!VE(6h#jQQ<*+nCnY>hK7JLeWKNIHi$rXh&aVX$?+cdPgsmz{qo7$pUMq~+^wLw zeuElZK&BfZqfO|Cp3=Ni_bo|lb=2X-Q}V~sR&t6B-7;L^eBXd()EKAMI$ha%hW9Oq z@`uHc2~*9F=-Z7Xs$u|J!O`fwn@84lVW+mFgc+a(7R9EPXgC!kP4}^jYa84s(bgGTILw 
zOYcfQm+tu<$T<4(v(mFxu8L;Pobutj%#o-DPabM3Onme&RThIpqP+G9G@|AJIhPj1 zw+4q+pw*rJH@rz{Y`2ttY$+NP)}S`*Vj4){S?McG)ob8hh-Xd^-aFDSr27LDC6RAl zlwOu@kfw(NN-FS3dRO|z{^Cufo6>92>)-`bZ0(Fwo<5e|kbWxN-Pivb%hLcpU($#- zv$bq#)9Ovr%pp)G8+gH@BwEFx>GGWLuJ?%mw6Y!p@pD1HXzf6tYxVyr>5I~9R24Qh z&?T*68XNE<>63jGzAb%LdW}Ai%mV0SRPIS{NN+NtW4Q8Pl3tgdwr&VnN$E`bK>CsN zVcfZw0Tf^fE%CyNuIng-$5>9e4GO>_oM?{gNS{li(d`DOvFt#V&Mn!P;y`9rc6Vzv zj%VGFzAF8Hq|ezfrnH8?l3MAObSeEeX-Cg$=L&C0UzYw#`ZMVnCep6$h?SHdeGjy; zn?_JbH>Iyj|3P}4s4s%32DyGH-LfH1!#F&t)r9c1H1bVA8-)c|T_A{oaH=Y zL}@LZ<5)&~Xe4nH!-6&}_!JL69i4*^BD^VmP5Nu;bz{zC5NXyllfFS}eeUvqTKXgD z+tQb%6L?W+>#a!TPNg4{uJmBs+tSyiZ%Z$*nU8HO<3u`_K9PPP{TJycerK6tt;sPO zJHoyt2y01ydniClf|Q}H!f&@kLK~cMw5arI0BamdZB?$|phix!VP00Lu}_l1H>Cd{ zeSwEMppEw-LDsqSLu*RBlEOrKM*6DsHR*F4iEw1icPT1+Ed7wt1wH-0C4Ei$OX+Ji z!DV6-xl$YVcSwbY(@^q01e&uDbClZ6&di~JNL*&tE8N&)FZq)lYt&@`fe~zRk-<6K zp}8&Gl^w$=u0;`_lKx2g8|kaq@I)<}@Iz@b!w=BzQy1Y4=_}G-Nq->S>37$*Ofh;_ zI!043eDR)=zA61X>B~$2xkUeh61d_(dV#)X(-K4r1zql zniTjCq(7DZNcy65i^fi7XT8DFD+V1)& z8idZvidM`1_{p`Ur9=Hcls+eYTlzEU1!8v*0iQ5{?3UC@A4nfYJ5^e|uS@@4`n>dl z4O`f->?I@VCB4BYgDd?%mcA(cTj{T)XKeyRw85rbd_&qvKl6)23!}F*vc?0p@HwH& z4J(iJI^m-?^#<(&!PbeH9$Tg&t_NLw8J0c4&XtXnE3u0KJI}65zmojFB>e~JTXfYI zOn}X#13I~L=}SoAJVbaXy(ayc^lj-gHWMU(pq#yMF8xaS*p1hT^cCrEq(76M#&3kF ztcVqpIe1n2Y23LNO0;$qNDdfYfLn=Bq#Y<=X2akrw#Y4V{vEh=O;QpVTo&%@NIJ3V zR%77LmcvydDV&Z*KvU^O>CdD;m0lorX>A(8O$)_Kh{jisFqdAF{txL-n4y(JA*e9j z6=D6e(hKaWI@L5@lKx2gx^ygkY>KiYtjXyAnG7nhgHL5izL%1s_Er%wp7}t-sE;g= zc}t{n1D9H3>Svxl$V%%If}JH!@08~9#6&%})1Z5wB!$mNe>T9Qn1h*Fm)pMcT>c5FT477lvR`6(0{M(q@h$`2f>Q-^Feg(zGiRQ`vFf4k!3}yI zqUb$f_`(KqR)j7i>2K1)=cTVnUzTo?{vBf1Dg)zNCf0UZko5nLrGEo>EkGJ)z*N$S z^s4lN@oO}bUXs3z<(r_S3#)xs6jnEk{>^(1pk)Tz#!O%IqgHlCSD>Q3@H5C!mq27E z#-E5M2`1fv14I@4heO^w8Mh;2!#5~kJDPu`ApA4ZH>KCm>}aP+l8iczDED2t^>p#8 z(wC)IIo4v$%X=Zd#*BBMST5a^{y_S3=@r7<0+U>`Ldk&~P+uEj=n+xYyiRV}X^;nE zIfB@h#rsF#*x2a@HA(e`f0;7t;Y$7yezqpFf{-cw;TaQ|0(lyd|B3WP=^N6Epjg2; 
zjqzv&VM(u*A(M`z&q`mGo`-o<*&vx5i;*7ZY3Zr3At|(aMf#%jBA|5!+NUj3!dgZ3Ad>J2(?Bo02pZYhLb}D7y3VC3xo!CH z;UvJ&|1U{jm!4y+M}Q~`a?-i(VP!ont!})DU;%eutDXuH?UzUX{Ma z&a3f9QTtCyxx&x&S{S=fI#^me9DH~XXLTzzW$i=a=(B(X5^InvH!!#S)k z%*83X-r5+o!Ya!nP2fxES?OuE{ti#u8k;?*R(0ky)D$c{%lu-IEf!#n8K`>8(bkol z-XrN{>3JgU=Wu3oeyc^)sl)NcXe)UX5hG*{>lf6Bc!h=Aa9L7tYI77@%4$2T$I>i( zkHR&<*0j9s4mCWqxyF)iO1CY(>^8s+=~X6BWM)sd=1EFpqDF*qCI9E7r)-Rv$P{;& zj%ZJh?EGrw=P42SKIh0Sj)jHPfB*FI#Yl+`{_d0?dsA+2X4<=z2l;TM(jjp=`!k+R=n?CVj%L zm>Z>zQXh+Wzk)pCT&WH&s6zImj4rMk4DdYOqyl48%%%^wPw|MW3^pJ(lrImDs zdw3unN;k2MT^5|#n2tHsy$OF_+UZJXEa3=*l`}7NW+>i-I<|0zCBIl9?>v<6I8Zfv z+swLZb3(aZA3`{Qn?`lo0Sc^0+YgZ3wG8ecT%AF-zis~T4D>cR_Va-cug%fd3=v4% z=?se(vre{_*nZROmu5>+ zRp^d%)347>Y*8+V9BYdT6jov$dH&20svM}gVftMT{vM~ti zv5}^y#B`U&N*yZC#8jxVj?Sp0cyd{~6oLag1wXBsi<9%V5jrg#!fnlt?Xc2elGH!J zslG0N6A$)?-lIT0UVb;Msa@$WP!#Mm)M#Q znphCs?cWRsc7uqkL{A>Gat%OK%_Daghlb~v_!%NB%wlHb$0z$4WW)+dR1vRchULUg z%LWdtHL)$N)>N5BhmkH&>fOK$hR!CZ5p~ZD21H@KbR)vAsWL?>8cf)o>2+x;*xUK7 z^oRs6LWkCLq2fUH2R5d^)1O=9MGaq?N_TJxLJ=+ojd@4dOa{*% z1VCr6mh2(VeB~%aZ~!-G-vsbwhAg&d!nF);StwPZrKM5!CnnUnCHvhc$K2oKbq)YO zQhd3i88#}5LkGB~QXR>ELriyw0o)B-zZ>w4bra;4&=pomSy`8V#{@3vUM(znm6?tl zu6Aqgy_W)?<9!k&-H-vz;bMgk-|ai#a9|A=EC_sM$Jaz{Yiqg6mHspqD>STs@Q

UUH2Ox(OGl zA}#KskJu`OwM3tzIa`ZWDmLHvF31^VbzsN9#+6&*titG6-TvL6zf%I3DbY*8*4o*4 z%O)fjoWqjeD%mhW*PVclK7t87vVSx7UzB@fJ}4s|Xsp{e3ndROlYpHc8RI@Wd+PlF zw&;%qm`Epvxz|+lE8em3^uNmhD!juj(yOTE?JR}avio|k9uA-+PfH4Lj7kS8xqxBV zQ@|wL`w)Q^W@XkEQ?JP##9;mk}#&dE&& z(z)~qv>@WU5e}d)L0H+w^00WosUaVQ=W$U%4u)366QzgTA-wS6z4R>r!%M@Kg2s1% z)2T2_`92FMGilf6Hfg={P>5h_;bzqP1DU8mR2F#R-ZYIAaa`cT8&)Wz>Uu#cli4kM|5n1--J)s{%nF;+ zT39~(WR@>f7If73;3LM2w^onrE}pXBOOoYtY+;l%B2gQXNE(_1!wTOZO1Lx#yV#E& zdc71gLy{JTqGYUXfvVP?f@S>|h(Cmd9LKe?zCn6*abb-+JwAgz?nsA(>mR9gAF}y|s15CsAG4e2kGL$oiTcyLZQy zs;t#>JwVYau9sDMD=S&Ge4!Z7kP9|B-OCd$(v<34yw?!>KE79vZRT*Z&rHXSz2&IE1q3&v5I>zp(H< zncInFU8gjJyi6y&Rb8KQhYVIemA${)Yd+Tr!V^eC{axgC<*QKMpe%|XaY zqslGVf*S@S1P6VXgs%|(2^tj(85GKV;Ci|ua4h!?cNJGsUclsw&6Hcm^wXTfP(#1f@HQvMA{ILmzeS3t6T*&52c6F9eyrI z#KEl;id78uU|6QoJnfvHFU;Z;HtTcvnP4XWP zEspxN=HAwuVLGC@}FWQvkoYA$J9laRs6j8@Y#q)z*W&BiLN@eN&{;Yu|$L8zF++ z0*wse`(lr1PAuEf#;EPUk0J?JvnZ?mBqC=&MAlcZ{^vwITa>!j&s!Q#s5QU8VwQJg z1DorfKO^XW?&>=lYN(#}!xxE0%|F?da8IyI1CZXsi4QBSz+Sq*R`=<*H+ey z!1J7U?3JQ3UNuq0Zr~E5FDSm*HG{WC?O%c1M7&v(xaBP5!FAI#8w1vIFoz`6D2R(_ zY*P>50x*fRhlFjscOev3Y{ zqGOmWP#$Li&I!T~XbO8sGkB+bG6&dV{avpLd+c5ZQ7ma`y%*~~PLgrK7J02agUlfE z++yHV{Al_d(Rb^A)oEUB(+^{v0z%vJI!UxINO$&!$qg1_VkR~+=$he6lRkH4|>I;EMx-xQ3iPS_Wpu zX7BC(%vf#(PAlNTt#MNWLm&46IS*1;u-QPL&dmB`128+cf1+s9z@9xMER;G$ z*vSu>_i!IB&%)kF{cebnPef1~Yr1V&_LemYq^kGwQS4p7S#k z@#dP?WMPb$ks49}d0QkL>KHZomgp_w(;f;E&7Y!a^qi8;l;4^e3e>A%$auR0o?Au? 
zIOWSd68paVFR>O!)WCAXKLd-({pBOBk9ar2y|Dgb3;HTvAt(2whjfy+U`8|Ru8REP zgc{}ugjis|0<^Cz+1i1{Wd_!CFCV7A;vq8xS&|mDP_uTvel0Q80k66uDqTo-42=*c z%60PZ!N?gm-5R1$S#~s`%Q=->wu85C@X=f9unnyB+Vb}!>83Z67S@Iu&N90HDr(0s zP~V(r@6(dSrdPO-&WIp4=000=pf)?*<TlD31FFYynL@O2A2Pa!u@U^3YH z0|Rsk{kUS05^ZaV4G7Uhp?qiqYwlmd*q!&KxEk;PBH3wedRb;d&FrArDN~F``bZ)I zj?j2Y`g;L{I8uO6KX+bQ5*b}g6PqE|qA$GwB4Yx|jtp>ZEm-2=BMu%K&=wNGeJkCS zMrtc;M^8pG=!!zwj4&kGibF2gf$cv7y|BjuD(s(g+T{z&X^wc(Y+rv|izs?9a!tjd|?#jzMe0CHpO$7ofWAd-mzPLqDEx2HWdUN<>_1>0z2 z?xwP0c^eBbAwbD6BWwCnGFr6N_Z!{y1-Z`!E^-I)JVr@e?C~0g58qlYS{es8?C#pq z)ZzO_g0!%zT_j{JT&e(RhaczBA3`8)8J}2T084AvAAV}{`^i>4mLAbmv$kffmHDZw z$4irU9x!&O=4p?ulq*Y-x<;hTpo$|4B4z1tG_cp!jx!?vte+(3XkJBcY`E@`Fd#Cp z854*pO1 zYjd|P_WaxpXbAx}5oxCAZ?O5J@gtfe*82{)fgaV^1e0KpMAg5A3XypXHZ#Ix5XmmV zL``aO@H45}p-ld%n?zL~X=DvG6nA$R%i=7F|VC}Lq^4^!co!pKk5 zaPwrrc0nYvGK|R)gHHa9!R=9f+%RK4<1r(Z>B8?B^pmvtmgrJTM{30u*g(@8i6JU# zC70&LYsy`o#0IoD(#Nnd-G~fq3Ocx@ntwDH1XdGnuB_5KF-}9e|6IB=Xv8c1XHKP$ zIc_b(;-_7CLVZX;COr?_Sk+f(@_+)frXag=7&EgBXA5Q&bd(M7^ci0}8oYh{O2C*i z2F|9|o~R=R?lfh(upJ|6Sic^E335cu)$k9TVuvTjIe-2`gcPAYL)dMyvV=8A9WTJ>9%5|BqURbY)t_xgYvK1Rri9?omBmzwWxuk9C018Jk zS{V*3#4{Usztm=bWN=na=2aT?uVKS>5p->;n*%{TvZ;x=xuKdAr}qg6r$7g`7_dv} zm`l7wxgNAS4UMrJ*i?oo`G zpcluEf<%q5Mixy!o;d&rJF;l0XFawC?yuOo!d65Oe=UHX5hIs|IM)yp3tCU7*pmoJ zu71)%ikdT{PWY9=Q2A&C>s~%8L%{-^8Jl6p3!Yjh?J$67$&pIrH^GJm3#Q-(!z3)U zV2XWjt(UXqozHQ-JqRuU@10djD@*(?I6td2veIxBqCRzpOlN74RN&8bUnpG+u;siM zaiUkrj;wi>geKe_+&K!z86aE6c5JM6+*&EQ6MpIcc1$03d@Bl)g}K#Li+G4+~A8bZjz#%+_Ja+$<>?+kdo}nz)61b%f zmK%M-V|VA;VDJh(7c+{JcFbrP9vaYiPT9bvKv7o)ga%1b11-qRRy8)M!b z4?2ii!@47M6xW{{!+JKFzQij`F~A$r#1`EBz{V#Gk?eqXgFbMHQ41~DSXwX{L{C%c zDd{Op>?P^{(wd0o{O5S|P)awXm!xNZTau`tgzwC@ zcQ>Cw01u2J(3pmVI994>ym~=!@W!S@-A=16tgXL z7r|0+1tW`(2?#CK`}8dnA?8sq_g48?^flEHkuZ&FkF&(l0Daif_tW5n4l7as+k+3gI-y z76tEZ!5#jsHSnr||J0gB-6FpmLU5(Ir3r>8NL^8wi8{l=JxNP&lXl)Iwl>V*hW70A z6o+2@6zIYYh{kO{Ox|b-y4fFxUcufknP_xGgq95g&`t~${LUSa*beQR@<>Z-OgoECqbR|jLFmtI 
z)0QL`>3@@jy6rG*Tzp}T-=!f^!7-@zPjm23&O%M8bY`e@^8GWYE#1S5FWr#tNXOX2 zwjV(Sarf!{E}+aPimkQbZgA_7yzcs7*N6cv`Js?sj%~bc07FDPy_b)Kc|sW_$h1?G ze$}Tsb^v68Dw3hTX(QntbIxIo!i;0kwUuV)1ON>f{-JRx!VUD;*eQZa{dX&>6rO4B zfYu%lhH=$KkIGoi%o-LW*vdgfRs#nR;eU#IjyN1vL`4VOrnaK(um&Rxa1covn=BAz zxdsn^6+Pu_(LkrftIFZTHLPu0k#Lrv<2w!qPdsz>)~O$ES`qqXRqEd(Fi=WW1v@tH)x@ ziIN&@-iGr3!mMyGl6vV+=K!`qUlr`l2@1Ht?$#{H8duR>oQOTgoOd!qj0+5KBI+)z zM3ISyzArtBbBb+pO=kZ70Iu`pXbSIPLJ5qyCVYzq=8T&1>0lsffv;)-zza*Bwm|}5te$0U8r$M*Z*Xe(4zG`IyuUft^W9L=O=bgI#&y1ng*bdo1 zFE6dgx!FfyTkBgcQOe315lb8I(5X=80M4X$q<5s3jd~cArj#>Y@=Cd*APGJGKzd(# zUAkeczRVb#Ye-hL!2o|_5VZF|z$Hc)` zBD|2^k$xl<((}@Sl12{h-LizGfhwJSd_(%7^f~D>dl>k!&1Xz|do?D0axsvYT~3{GW~%(HkmG#|H6dMwkjJ$RUB@5XFixSXk-K?@4b;pWzg^;NrDN>oJ;> zUk3q&0|@ASLmoB<*w4WZg2B zVCf6+bLnrTThi;K4O`sL)JAdcq{q%AOxxm3>F=dyr7v5JE3n=zEQ&%plODu0!Q`pm zl>R|_R{Dlv?V@T1A%oB(8toxrMEfrt7VaeuRNo^*^C}%lo4ItlAf|dBZGjp!-vM>3=`>w zB(%2ltA#nzk~T;9+8H26<6CQ|o3oa;C@(Co0}qqVyd!;=F4qOyb!I(Y6E<@q{B`nQ zNpDGiN8Res;(?m(ww(1#8R*&0Xy3KW>(hM9%yQ3qUE9vZcZ45Pmzq1i}x zrH6wIxTI2jimwfeJ2C!IMa^Ow_)$fIePmP4HvD4E|3;BvzOVmbLjEh5;SDTyVYHhP z6H<9QR&u~U0tWvkN20dIT`94Ik2rL<@|Ax}`bX(0>9g1j0X&rO$upwY^U;Bzf#S@q zM=Cf=HB*3!(IY*h*qgp_Z0(dYs{~F!s*n69Vyk3kJP&9K2r7>Ft=<;sex{P{N#BL3 z{G62qYbbSRq_FQvACfgm(oXtV`tJ}mUyz;xM%@~UlOo^`r1zw=FaSI0uJm0N{dw#D zlli_K^Wg5r)*HyMwcKOJVkC4b7ACiy(5+z&zMMdYO+U$vZiWbadJsSXEp|>PW&HV$ zP%P(Pjz)`MA^(4t?no2qi+wGrjMDg#^oje>5aIWw2}G`A_IH8NKV>tfSm|+yFm-_Z zO!_PaDEWc)0mKF+3GzUmJ0d$Wze1yiglRlFEJHGyTJk~+KhkYVFmfn%xV)%Sdg(LTQ ziijgOe#EJ?lHQiya{0&NCE`Hff6p*iJi|^sl71rnOnS60{|4~k9MbfRD+^-c2AJ;A zz=j2c_rP>!VwqVq2b{9CdKK7~TWzVEo!R_m^*OBL5p*SeBK=5Oz@<6h=MF4@xs=YN zx1|r<3T&lcN&kQWdFF6u7*sl}ze@@(54x|_?z?5T_rV$w>?m6X&^|hlQ z*3vJf|0eCEm!(@24ok)&q~heI^dsplUx0M+yV8G>K9OFRUV`%zoaR<~DBYEQA^p^e z7|BGuFZ~zkvGkhs9LFoR#G6yr=}qZp?mglsf(1|*g&>0S8JTwpI&$U{Z7amm0H)XM zm<6@335d?AUo4V^0s34UU)l#kjNlJ0!gr+qD4j}or8{OQYpWi7DE(Oa*!OVDNr>N- zB>fUFKf{LY(8P1;htiK-hm-t2l>T0tNuPmBQIdEifG?Hx-k09kFT-TWJVWm;tr{29 
z&xV!SI1-r_`GItS&eT*Ag0UCKL%}_I)=L)h1N*P30sN&c8)gn#8*Vsw zHRCL%kiuQ*N77PyDBYHX-*372f%K8|3+ZRRokIS9C!IFgYyj5 z(tG3;52TljFx1!~SofqKNk7?_e`b~9(k5?B0Z|e_b_kIuVxVsGlvLa%u-Mj=OBTju zss>E*mdN-K>HEa`M`tX-#L|++m!KT1K}ts-60b$Vu(P{Y(nCfw_Z0A4dQ-YD{j<&X zk90I6K6yMi)-o=B!;+AoH0`XN;aqw|3OxKcjkTp*v6hxtl{31t@&Sh}BSxA~3^^l+ zZusd8={zKq?qnA9?aZK>DBlKzumwzfu+KsXDV#}fNROl+13G6cT^P^{>ArM-z=a7j zl1zNic7m5s5dq$lJ{gR~OuzaIPLWst5Ra{OLY^~;YGea%Ej7w4~wgA zdd-RSS`yit0P7{2v1dSX4AsucB{fm!%#40Z__rOkLFx5=Aw7`(yYP`A-xZldboF4Vak7RRR2Q?(YiljD!~%^{{$zS58PmB z$f4(3XC{KuLfYlPdyD4R!ltcdm>ti?vunpe`yRODD~_#trdN+v(^?9I_bkKr6>mQE z;teO7$1WdSKYp<+Sd7f|QhJOs=GcZg1n*r*xUpgBQs#Rdt6NTcY zbxh`zqN32bA{y=G&!-T*CiwUj^9ggnr{GjiuM2|2>Q$fm!oQ~ce}hl6w=;i(#S>d3 zgA-m^_*;QzF0kI68BQp|(|sRsX>`Hp^F8SEg}U4B2=iBL?v)K!jGQ?Mb(}}a?sN7}Z3cW!4J5abX}vZ87eVO0 zdRctt;eYTAE?9zQe+Z059l*cv=Ql7Qztbmp4r^e@lJbizWQ*`k9)Sip8f=w`m9LTv zgTtmld=@({XIgPEF6Y*neJusxf8)=uyLcHi_hSN=6||v(obs_Wlb*HWT&KVl5Q2{Y zs$e@@LSmnrW9~AD9H_0KIMgybU`krHfK+mA2J|l=h5x1N)PmCnY`KG-BRj`CDkNPT zoMDi*gx`_Dt?;coVw)Niu3LEl96Xlp;2DB)5%tXqd$hhTiTPLkaUxGNuC!)$T8rI> zFn)#nf-1CSY}V4oBMM92m9bi!c4al749l17T+9X-UaWsr#^}=*AbK^o_}&@eM~ts1 zjpf^sNuyG-G{Vpp&%S}mf1B0n8IWX|PNds560F1xwX}YgShdURFzA21pVquw1`8y( zD2J3^-SqTmpgZf1Zvje=;KgKp=(HNJxY~XZKC=iov!1aqRh!=>zxv zg}bXUEfdnJ?I6+40Dlfld2{2RpyqPqD!Z{@1jHEWVj+hHOySrCG3#VE`L<-)OY^@^esD2lK?SI=(Fbsn7 zy)^b^Y1G$|L^J}V2o_Tv?wrXAfqteU>Lc%C*fB2b*jOybRCyvhj!*W48oD6#Dg78Kl&bCbq2C_EsELl2VvvHZjnmrTNw%z8$+W9zOdLW$TZSO4*bgOO zVEWXo!4SaypCSMMDW8Iz^OFAz+AH_{&>kZTfg(1lN=TZKQ{+3wcdlm#%o@E%F&m}`QLm`03PVI*Ik@j;5#V62l_iM z)!hE^d!|psm*Z>D(BzVu2!1JDNYBFQ4(A#py&Atb5Q50aVjxD`Ina%D7@*9QzJvk2 zqX|4FZiN9_{W>M?psEbGI9Wf+9v5M+z-mj-8o<&#VeyJF7D~Ife#0v>l=gzXo}<7U z*xV(}j3-_P52zw_1IV(58?^S+a0u)DB6=9KB~g=Xphq-*oZ^PtCvWLc;X%`Iq~=d5 zBV>L2WFS9%Dv`%mesE=Gu5RQL%e%z^hMrHDGFn3`pBvcFsm&qBi3M`g80^r(tvq)w zTj*onAAnyO?i}3z!ibqq`dp{NdNK=0eX}(%UTt5CuU$N6{0>6bC#`(UZpkQsEjj!y zDuG*u!JT;%k1X*mS(qRkMF-QA3~tCF&*3H(G{6V#F$jyT4an|U^q~XDtp#OfCqf68 zpoR^94eveD+u*x 
zP??AASn1+h(rtQ9Y7(E!MpZqM?t3FlyhvKS8-Vr`)5k5ef$L09{ty+IjZ&x_0FMnku--L3_yYdRwE;*PXmP>>+fcr=ffBXs z6Y1mY^#3+=rsxBx2=2sTVGpj8|841(9hg>I`%Nxg&;vfQdAyY+*gh-ckw;reW6h3j zn1N*dZ}E_sjgk%4dV)|N_RkyZ@Vg*Rgv`{pDnlI~amC?bR!<1wy<8E0`F znJ%@wNDC_em@PbV@l(=MoL^rXt9WNr&yV*f5d_G4j&ZMTbYV?j+>QZPkN2lZWTh?wAc(V>&;Pej)uc zjj289m3DS+62ndD85_w}@cdfF=et2O^wHFG&AJ`a|g%SR5@2aml2Y zpGp5Hec%_T#d}Hmn)C(fIkUABYu@?@`F}Xt3sW-hum%j>+QEWF|A6da99GZ^6D>#u zS=ZcZ`VH!k4;;vpASghJoi!b&)j5;yKEe7Y8}N$sP3iN}^T1B&X&*?f^t7~-{z>}J z(pv-lKa^gRz9GFXJq08l0Ard>_(=MJ^!E(c=;{Bf(jQA-WDY{Y0#0qVdzu*W57K-4 z^3QGJ+LWq$Yif6Z4}M4urn~us&}%`;Q1%1j+8R0vpM{kf?>XYwOFPpf?vZ5vr0_qq z@Eg(}O1F)}dq8{aJ?R7K=hAni_gsW&^Smm3UHXi4lc@=78}XXjxPLDF)4u&#NH0r& zEPYLS0oY{%r~e)s{!Qt>NNGOD>)&|2Taqpzpr3_~>QcIsPYwH9I?#NhC;)$Y1 zNFm0p&7i;FV9A9cb`4i9(e|rP#mth-hyXK!#S&w-B`Kbc?tzRcDE~(Kmh@G|LgtoT z769YVOOK>af=iQe3e_J<|4#ZN=>*}GHpA?pbSB;80MeZ=-iy-LrEf@|lU}j*^VAsp z33H*JB8VCebE|DYK-7W75=@DsGBkO&pT2ks-6G@?2OxAi%{n~rK8%V?Cp^oJFs-rX zvjt6}#i;z}(sR<+rGF=VnTo}PhfnYP6gc^>q>l&s|FZOF(qF=Ly$vgNXP8MEckzH@ zZCaQApGyB;`Yd4j99^u)ChkkO$>4@WvKHlw#5us98iuDR*FHpQ-Of3%c+pY*FmzjM zp6oE%4E3z(B3#?ia#MCkZX0eK#Y*yjQTj9KZ=|m=1FayMpAb(yl)i8J-wx#eH`1#> z;xz$e!Sxr?^U^8x*{j7%{N!&+e<^)ddKT;>We_Ru$)u;H7o@ws^8WC*7-v2qC4QI z5iX-($W$8m^_292bc7~pB0VSlx%3~TKcHJG?YKnZp5v2K+V;Bi1L;A!xR7p2Uz7e% z=^Nm_x%tc1?B=2Lf%Fqj-R`~-_--^#3mdQ`EtpY_nS)gH$W$dZe8XU+7NaGcXwKR1 zQQy51Qf^hB#(uFNVUFRZJra9-CeqW=H>JOk{!n^RDx`D$uR zrDyR{L4Pk8OPc1iJ@4|LO3z7uEd8nU1#sJdvzDB4Q%i@WxIc=SA`|IZ>08p@NPhtS z(DJ)e>mgrCFG?>-|13S;H=_YTME7(IQpyQ$W~fu+Wle_mjr{ zoA9MWC}07hY^X+#r(em0{HgSp(&w0gSF&c}hR4#8bS6Dz7~9pweu!1Q)#{WzwtCKZWIU{hu%laf+#Eahf@=aE^e7 z_D3)|8=GY&YwKu%^e_A6iL1R)^1YfotUzWd3BCaoG6n%95Bj|HIq4aO8n39V6$W)r zIWF@l*7GWVeI4s_(-4yM-e*P~+mYRraq$B1;Wg6c20FuriKfSx?jufP9d6(qGmD?5 z356d^4;_Y!9@wa9`2aaDix!-h)GCdfi3{ zWxz2rGPtDA$I_ix`F#g)!9j8b8XFUJwNh--}$*T8uV6}tJT{0p{tTK}gxM0myXH#n0yeT|(DZ4p?5kOKLQ zsA^5_a<-p=6r+db5=;a?en=sTFC9r=LWPfrpi4Ufw z7I9W!7e{tb_~St@UWyCs zJF?ap80}RC7N$P|zEjL)Fp`2mePFzvPIy87Su+kQ=M{D&ZV3^-(AX#0@Ry~Rr9&$h 
zgdi$>dvNjQVZ)R0dWobs8Ju&erEwvn9ZJM=kx*aRpVy@?Nl!~9KA^HAe=_ozKvc$S zcu^0WVlzq`Gu9XnW94T8W$db{bReDJoC^kOL{Cwg`}x30D6v~wtWs{W*-&_{tQgf( z{w@QUOLwGa!B;Q)Sb3nj6C}}Lz$rjHwu7h|>xxZT_!_01nlfEod>}n1y~tT&yMEQl z+o>AsdmbuK$S_dKBP`hhr8#Ha5B*d`8HJ)I*btQQrR8s}f$<#xy@rx7HFos{Ax|RD zWbA?UXc00)Efe`9gFv~n;#$W2O3EX1>b?|g8&_>S~~^c>Em zAp(yLC~WZQXapC^bU?&#Dt%<95VbZQKlXDSI=p3~?K$35L? zWiS%%N)Ja{0ZKc}vNS@sH|*@_@SF@9c}o0C(`Rw)u(dOKFvFO3KC~kCn_8+IC|0!L zdJ%o36~iHj3)Am4y)4JpWct|1;T?dyW$aRbhmD~_6^~F13`k>JX<{cyghrM;el={699h(MJi(SfsoaN7(AnBToB$Q$jn>j9(%k`pw`R=`h|!|^c}GksQ^9c( zdyRdlf#BA>{Grh}7rbmS*uOj6K#jm=fY!-xTmYJ94&g=w+At^k5oL^B{|wLo;zrV! z*gp$YaKnjohwR9i^^^CC2^l{02;xPY3rG zt!!Z`U9*Qv^F77j=`$kDUH@Ebx2J7Ck?X?mh&oqL)TXv?Cu9~ePyV{9k%q= z%_VQtZGfn(9J0T2);UoT9@_kvY%q%{E{@)&xutAdRIN5>;&`-%(U32!d6B6xDq4%f zV$x9;gylT*1qGziq?i-Fr9PH=K!%J!X$n#Fl6XB-t?f62jQD^s1MxT$JKHfg#$@CQ zofM-qpDJU=#Op!q6o##`GybkVHHC&HVWgx}>A(g=6)4!nKyrgEFtGsi)U=>L?MqVG z-ZYl*ZwEX}kZ+E8_!S0l?Z<(|^Phr$Mjl(4XbzCv*(imq1qZn7AAWXe7t9SpZ*8%z z-aI6n+vvO%eA3FW(3rB?*`Fn#KorAfxa4E&4_N?kb)Q}zKw*`xwYAihtm>9CA$nOz z5KboCH;h39YoRW+0bp$?$QVE zyo9il^*LaPdV5J0m_-UaBVUOotJwD`1~_NaCjjztbOX0U@Jn+e8BcU7ePXP*#|CO$ zy?fhHfw- zrc>w>Ga{Z8fd;kgz%?)9`E$^w8w3q0q-n4uGq9*(4JypQ5ewN;^^e>+H?q`jpTn3N zS}KDLTJG84#vWtmYL{G^T5wysgO$6q4xkAZ@^XLk`2MYUbNBLVOO8X$szDwYfo8a< zWb03G6g!N3wwG34J^z+uEut)e>#U8y9az|qKXu7O=f_xtEkmsuxKwk>dx!Q$-A_cI zQKKOhW_N92WDAzm#+w(UR!d`T$Wo>jEPA+kdl1kB)u}lXBJhtIqaB(1NCXmKW*FDZ zUT6ri;sw=+<*1;=*XAdz45MAK0giuytP#$e>eveU(NCrWQXw!@7?H+TeV4Pwgi zEmxl@kl4anC%2ZV)RyQRUw5>6L*Y9JfT1%5C;25l-7%V053UcQY{f2}8l^PQt)NwR z-g^h%JN~3VDFp><%AYi&`WvB?Q2r^v>Y8fW(i-{l!6S$V+fo2J=K0ftE%E-32U{RR z?JoHDg*D-{^xe-WWlr~1ATxtlSOzmC{S^-qEQg-{G+3EP&E~X%gccTN)h4b2NytfO zZ%H?CGzAsN(guRY&uRY^B5;oOm(agRNJ|3!V~>Li9MzV)T~OGW;fakApRiUM134yKSkb7P8K-M==^9vUsG3Qd3iuAGM3T20}W{c3BY zFViPgXg(U?k3CI_K8s@SMCVJ+NjzbDbSYRkgbEH|N$i~QZWn-wGb2fL`|c)nm?J8s z!2)V5M9D|3zswBZ=TP+)sVsrMwvsy2a*Q)ux&rNs zASl-JXpp@61{Rt%0qxouktH_{V=t|0jo%ko;lu!NS$QTp7JB&0#1=*t!Oww^CfI-z 
ztI7;N6?K~c5105ekyoc$>BW8uAHFU!&^eLi)->wa(%epMcdStL+#F? zx5<2E)cYs4{uOc^UVlWC5vaJMQxoZg=7?bvP4Xepi?pHG9bFfJ9OwJ`-?ED7k(QQz z=l!!a3-hF%8x1PPynpg~$<~+v4M$UHZ}VgplSG2JGdMcPsLiPTiMT8M*3yEe!bo)! zGvGZoNQXmGTZ<)K-ik0)DbMDMLXkx!=7}{AFPh8 z?HiGJh^+HIB9rnZH?sThwdO5K}&4uI|OT6;Y07!irWh8_V4f~q!2D7|s(zGyoGwXudlG)8jmpA)K zNNl*J^&&>-8D-%m4Pql@$c^+sdWI9}YxK9U+Iw)3x*$HIwJ1t=7sx4ks3(>O3_lIF z<=6tXsPtCmbc+2Lx(o%{ap3yPG{J9Cml@QGNAPm8QDZP==^J413PwW&OSOS5iq#8F zX&8ertf75dN^Iji==5MuOD9I6UmHK%yW}fZL-$@gV}9iH-&E#%Gn?tzhV}<~eqf#1 zvI&f5rknKu{P3qJ?O)bb&qxH1f(qC$4{B{UY)RoGi;fq}t&}*lK^u_8hV2q`keaj9 zMLIiP11N1Er_;v^JA@}Q%(yc^B|T#;K}Ya;ZDHUpaWNUdQJ)lt`N4v6p z;fuT^S?{N$jOtNhP~>d0HHl^qg_?kqZh-TGza&fU$^nxzBG882zC~w(QeEPt>+A4O z*?w%;Hfu=7Q+tXVL|d2pY>Mbb3(Hkl;t^H7OV7h1#}9p)Q-+E~3V*z_o{%Sf3Y*5d zwy3HhMLBRmWX8GLuz40_z3E@hiJpc9tgsM%t4KmawJsTFKN)1u!y@pEIQWvkZlO^= z1l0~+N0)_o5Erz831LN!OK7dhHV*~bk}Xy?xG$bEBcMEIGQv3emchopfvkkCS6K4a zY;d$VbUgIOz%~sBOQz?)paLD}#Cq`l87+b*#26R2)Fp+V$nm!W100`8p9nLx%|{!GIC6g^%A+ew>-n4Ulp@u;CRXi3+_5zgw_}=a3Xv`w?_&A)73ntpVcBt^7T# z!)CyuOOU$_VOC*9mnlBvVh|HXi6f_pVrJ=hX%;$wuyGhfXQbz#UT3Umtjh{Jdm`QT ze%A0Tb3(T*&wt704MoZ)mJJU0VjoMZ zsWHpayd1L=8W{znUKi-ZW`vZs1#;u;)ZDYM_v)G9Thenjd~AcM2o_I}%!d%}7Gl|i zW}U)v!U-FB0UKpVAWaH;Uy`(B5L!^0BVsCOGu!I}NP@lr?I4i2DTb*c5sl{ZOP|D4 zusJW0-p=Y+wbkc_t;nspshU(ZQkaNr4zW9%{V`*0j89}@7 z-VG(|1MB|YV!fl~ra@A=t_HB6moGAy6_MPA%qRp@xf|Ri$br#b8XjXoIXsbVdL0pAfHoG;B*k7B0jsege%&+F z_QY9VxH-8@dai=(Xow;DQ^>&Z(hd?ouuyO^fbnwDhg%kVZU{w|$V~{jhe8FgazXj@ z!06u#%=HRSTJA@nUf^pjOr07Tc0xN!HRy&2gF2;RF|~m5oClnf-<@DsuDoBG@N&xV znDD+0zup)ewIom)v7U3jI-v_DXTh8$Q0)8g$RJaU9mvI$4b%X;<%9BH3p^F+d2YM! z5{^?4ldolecCwqix}0T2y-it~TFi66O%Jqd*tTua-wU zr}lX*jWwzi7Z$|ksBO5b*qyyfNM?RN2(TMt4s9WD9ZeQ^E+T7bMS#@WBBg>TTijrD?MhAt#H=8^Qt{xI#R4o#_IZ;S&K8TC!+^g0-^wIgRH;9*C$ zlcvqqTxC~L#ZUZIa9v*B$9sqIX-f5*y?fk*(0E z&W!mZY+3CfUPouj3On0t!;O~@VKn^K4(8g@?9}wS=s_MA5nE0l^uwM3Q ze$BoQXe{;?d*K+J5fPWWWbHqHZC%B|9BKh?8dBl)emiRjW(NT`!y*RrXlFgT9gQ{p z_p$UW@S-ezi05+tSFZ-!X;PrKxsBlxa;q&kNSbil`za&;X}R4TNH;ht8(h(BApM-! 
zH&$*RqF7sNw-0l(!>Y`I zz5|0E(f`c6=RIjFJ!KTWsE5@SE@v{aRb)Lmtm>A3Y3T9VfK>F*_pOBpkr3zRo0AA; z*Aa+?=4E^-0?kMzYJQ^FH{iJiq!CF)F)z$(ODfijAp>w}jc8iqPg?_JbI`Ss05n33 z9SyvBe-}k0_mEbC#$R4EI0Oig@f1h>rr3CjEX0DQcra6qsU@xgSUy7A;G{rOAHe(o`_;km&naZOV71AN17 zKLAN3mgl8srKLIjiIGTzGoB4N>`{8FiC(8@gU%km1wE`1gmKYN$eux zs344N%!6l^!lmU77DzlyheEgwD=bFb5Sa!%8ZEuUPh`O}!@AC_G1$GlEG+3H5oz}d zO$fG7U>P%ZNDn-VX&W0Vh!@o2XA}faEdm?azzrMy$l}e&>kH6@0*ohguDd=;ZW{}Q zf>#x6u?%^2VoP=LTT+SlsH7L@SWH$aim)T$Ps5PyH5R7xil(K;t3F~)m6sS2Av<>nXO5C!#LnPjp zz*$ib7)GBFg9dDp*+b?1o}Brp4{+wujz}`B$d+eGruEb?@~fxFiC%)KF*UCgY5lqM zU~ndJ7+=`{znX1(j9yn3ca3ELVW=lWP%G3W0XNARIal+!Ebj&!MaTqu6V=fW?hJ_517z)_2fHwR@X&t)_<|0DAsmY>?;IP3CBW>DSV5n0=ymy9#w`7VjSn6u=f?SyO!RXeKg&eQi1;GxXZ29>U zsvnU}G<2$r;5Q+KxQX*7TLWEl$fWCm8OUwy&KfN&3__GOGae2O^~%|(K^|Hebum5a z6fw=o&a-{_N3ta5q0MaTMO(y-nS(tDsNz?4R&~o*wup%*`-5OgY<*DG3UdiH9^;Vj z>_#ChQRq50>TC%Ru(2fM92mMoFH#auk1~JrHN}w?x?Y;~%&aOQ0@_{?bxNw5xZ!*0s3s9nF+`ATw8F(@q>J+Qr1k|oF{UTE+NQ|UyyMdp18ijt$9 zM`+Y^5Ceo3ZGj^)Q}Veb5r-tvJqE%YcNdg}ntv+P{<3eUhM$IN;0-X#xUxa$$9YnU(u0B4Z7mfPO2c?PpV=pm~Y?36@e=;?nTsQC95kA6WUhwIK)4 zoFqR-mWKlfMM&kC=U4yA0^b%?fDsJTi6|>FjU=ZVJkFM_SGpOCH;;vkI$}jZdPnY8 z_y`-n{+#+oM%P8o%1@EVY+$ed(kdZ+IG4m>bOH1|0LWd<5VUD@kvbE}g z1CI4e4`78*;Y1NKleDGCBwa8 zB8o<`xe-BPc}J}0h4jAkb7?Kz;mX!};0goDZ+IJI;thU6b?<;XN7^;zP_u{9ed)fP zzIjz>_)xkhJ!7W=Ec=mCW!;A(jYt7O1nzl>8wt{73HY=cRDi>J&-=VPnE^dQX`+X= zLmz9_W@?l8YSd#&+Ia~6H*6yoLH&Ij#y3Tc=Y}Y5aEu!_pbgxfdk`L`Hrglx;s%*l zp8vMex%9sDK1V6%^y3x=mj_2Cm_9ufitt@lW{j{jTXmRq> z!Ns+4^J?@oVvta^l>0rpF8#WcekFY%-Ii{Hpornww~1bYVAOM<7t%-4CtO(?gQPIy z7ZC8w9l)jZvGlR@GWKUeTgi@PE2J~&9{i)dB5=6rRHjL99I!NuxTmj7-sfKhJ=Er9|GM!-H*L@wt*70XG6uZjRp9n z^h@cCqg-<1G3TsXJ3!ylogYg-mcApMNG}m)L_xj42BpP3AH_g3o^4^|sfGsF5;GFg zSq&V(l<|>;vB`JT0S_=5!LI1}pvaxJcHBT~#Qz#;-2$&0F`#GCFQor2NqR-PgYf5& zf==Pb7u4FT@S*3@+tNQvQ|V;30XfAPNb z6X_-CMZ8~hLT6T_k$IJF@!po+l3u6bEJrCWu{d+Q&tsmqa{%v4A4rd-n?|au%?HMO zlhaZCk;RZ=ZC6y*Yx2V|FvURrl}SGR%ON{2XI*lur*-&IRNtPKo}m&JRF<0R$qB!H 
zISAZcgg=%3foU$sK#+m_m6jYuD%_cnA4)Gsx%4tdd(vky)#%6&c|4cikiH|`mR_@B z*xY=WctpxicKFR{8WC6 zM-VEt*>8nUsn+|_vGl6+S<$R}6j|-y`N2C?~Z_@$z;qre=`j1QtKcO%ow*Dp0m|}oWe6G_*dQ1BM zNvG1E+rXv>45su(t~os7GQ9U%`la+w((}wjTru(>ijy^u8G~SYpN!Iai=&$2D^}Lx zU-5IzzzkH@U=b{gTQ-<*X1ut`AdhWPn1E_ZwnyEca#`a^M+|6^|9_H>r9Xjj9lsJV zWkDzNhpzt{=>zGz(lgRhx&zQ$S$9|1!}CEBwUvG){XlvJz-G$PdJ$O%D0Y`d&~{(` zx1^gk_#z;{#zOHD8TZUskXUw>Y_tZAY}gJbwwJE%9HpO|Z1@QsB!S-?vP?S@=ZO9J zZ+M)qQu19`k5@yY_JQ0=DMIe>s5j-_K(Ic>2ca*2Qe&rzMuhQzRg9kPzr+1%oVh@E=YD}t@u=m@n{ z9M)t5PPH6vfCMDGBmKR!kS+=3&!l&xQ(iE6f}cw7Q?a_*A`hkSk>q}vi?_fo0a)Fa zek{EmyC+@zNcw?PNDrmYG8E-Z$_ZQrjQxG-C(@(q96*#YCRjWfw!FZxgbucD)^>Q( zC?duv2_e@FIZi)w81TI_Qdnn$YWO=7<2_D^iI0)?u3woPk^ckfyByzp2V;8K zKT9ehdOwri9jI{1V-KWSdKp6fnu8frNN`E6_A`%R(#n5dDx_aZFG{aSPqC0ul53@V z(hsDc`pWwf6c#Q<`Mu>8R=BiFNE|ySahyvJrDKG&wQiVTRul#>b>25vd|@%${&^fb ztkBvzAcjlO5Q;af;%(z=WZZcreI)$=E$rn{Z%QZ99cg8w#T=bX5bTenHwW@RW#>($ zuK>7b)|Mga9KRS?zAgiZJYz7Mj`=|cJ`1<`oB1?2L( z(odwf24hJp>8|u0>7Mk8^gJw*)<$YI(nIMD>7CJ;rr`z`sKkzCj8+UU(GGLLjNQ{% z$L|4)*Rqoyxciq?@N$E=12GN!rKAmXHQGoOQg|f&RGLUXhYKSD_pk)#(!0`IgQJPk z#cxPE=^e7E9emUy?CB@c&!ji^<$quLC+SRjfrf^F^o1^Oq<5t^26qnS-(kaN*y96~ zGC;A`z^m3*Zi$x)*lG&rqs6_JWEQ<<()2UYtGuOFGR1T)2@lrPBgTC|QqO;o?sGh2 z$(cyOJWAH$gHaW}Cw*6XEImixZv_@Bobj3T$-sK0JAWkoqx7-#iu8(fLt0x4-;OCK z?@RCQ%U_1XSHv?pyC@ji>2DwIBiRW8dqm^P5(`xu?^6AR?BPc%0;nl6OOGAR5e?g9 zGYEYL`hP0Dg|EA5EJ4 zmVP4Lz)nx`M;XY<+tM#Red}rfl?8Y+jQNhIpK{R>OVF`lGpNxg2z_EFA;t)cz_W_M zD9GSz2euG2?jEHc@c`5JNESH_}RCA8xJKskE6TFsE zff-vWjCP<17o+m`BIMjszM!qNc%r$v)Xo9tHnp=dkKgbt8|#QG{W!6fYobp_{hp2} zMaT8q>m0C<|EaZI?ko=s2z3S3b0~t3q*LkVb_POaacIM)UXNa!cHRfl_sOPqR>$5^ zy&7o*i(hHE=L%U|qR2B8d3s&+v!L^CV#syyntbz2J@m~DqzOjW4hZ&u0$_>?&PO@u z@kwd^FQkvGV$m3*Z%Z(AF=|3SBAQ=X^-yN{Hr53 z0{MtybpUY_Qof*9S5KDzu5YC>TeH@>S+14;MW1P&*~NOY{43jeZGZDLwtq%0uIya2 zIUw6qx(NyyVD@6a88>3kn^>L+-oG#=WM-ZC-OJ}#|9h1C*03>7$qA=qBdh(;WALfK z&pk@bOd|ECituc2hsM9uPqsggnMx_!Kk=#~JFn*llTN0>-IlUg%(5wgW`?~G5k75s z=q*})U=*Cp-hLZmjO05~m8e~wQ~p|7PjqDD*M8zA)%9Qa^o#XdT&(`eX1~$J8H|=v 
zI-#JGK$$bf)GVc2(&0WCt-&k==$#oIB@DoYO{(qGAV+&zn=z5FsGCre68d{Ui1egA z5b;3v#P2-m;_SK@@poLjCtaUpb|Kw|!Z9W2s3{9iteSr@fDi?n!Fg&k3t9>i+Q<7u zj45P?8qB4{l3WlmMd_veAE?4lUc66He(~B*P>)a90wr=fq>K`A^okfp7KUB@TwzCV z?pV$pM{-X2lhQJvUEEUyP!G`dThaj#*2X}+rti|O{rvV?*rFJIhd~VUDHprt%l0l9 zUfk;g5d51GHgHDxSy5NsNS{a_4HAi#Vu`E)wlpLzHyCZ$iB}UEU6_42Vs=7!ge+zaBJ$~sQd7amA0bc|vEgthqRV+&TN0C79MlN*-TEqku1 z*GB*VfB;EEK~w=?5O>S4N(aW+%(-pxslxwVrT9O6@gf7;F;Bm>n&gHP&7wnj=NNP_ zC}zm=7Jt3!XV!(CS=i}+l=*5R_!RlFlBmdu3h5zR+UPi=mVEnt*`kv3F zkFnnrDSoAR<|Jr-#uHa*X9<`UxKhk{@M=ioUvnq1ry*yWm(4BP|@F? zPm6@793wEZxg;UZD3R?fQMy{jIVnS^XxT3xMgIM8pcrwJNxvmEeJT10qXXjF8qB}m zPe$$V1|)_WsBDgTkqz^k4BTL0wSv};o@iOiFe9b4q;`wnLd5aF^^n^4R=NlHwtjLA z;9u`2V7m&gYl>U8(vrAwN?>?lV|uPWk$X4Po(sUwn256>r`mWgVP=tX0Dh4#L~CWq zOp0P)zy4K!dNqI&jSJey#0+p0rDulWceUgg0x-22W%`vw|9uRY*1Gi5XF{!Ei!%vg zXhq1HGl63A>%ZP_naJw;80*t7esXkn?e$S0oLeimOdJu7rIYP(wC#Y^8&Be^f!Qk*qb4dS>&t&ozWrnx1<0e3Z+(k2Wo%o$(u3R43zCWCD|Duv8i z#~nld%~aSU-6-XL%EhBQul>}EGhROO-I9@?Tj_-3@EStGULF)xt~Ev?LYkWCj|X-d z_-v4YRU~VLjWI}B$HV@J)yVa(wX<1Y|M$1brxa<5z6kf%@6(M}Y#zU-MpwQE0kzh; z=W&dX1Qo3a%w_wP17+iF~H_T0H2(}hBkvGWk+v{vlwSz0r!7f~w%qDC&BaB+PKhXm|H3Jk3fS}r~ z2R_IbFXSI#c}7UEwwA;3#XIZ+wc$f+jP!c6N?9`nrZJFh2P15b=1$S{d?3ZRzGa`R z`0AX&W&vBoH?MT#9F;z0g}1Qq@_r5Ga4<=JR0>muL(a?|MXPLdxYpN+&}WiF<4H0V zm+1es@;~UKkTT47#l_1&7y5EbE4idDD#=X!u2jY?r|Jg>IjiqK#}uC)S7YdaLCGCa=u3C3%}&It~ChB~|P z>usK%1Bk%kmJJULpPpLn8uHbi2fdU&l0KH62FO|W4*|`Hl1gBPt1gHjpcaNIHGQDJ z;-7jj32gsUG7;ld>6m*iXlh9&;gR%?^oI0cAjWj@E7A+nA%dFYgA$A85tGM;Z1Rcp zwDdXYd7HmF1fH@P>H>m^F(Eyrn^-ey4yAJi8$GIFht$z} z0BV%^7lfagHNnUDiI^8OZh$7z9qBWq?IP-(8i_pdWZz(ZSWo`9q}Qa^q?;J6t(nnQ zx-Y#Wy}2*KLb@TnCVfu2gT;?Mlq|$2(wmIp7?Q9XY(<73gc-=plErUnPy?7FxpV0~ zn1l;^kj!jw&xTZxybf7~^zB2+aOW7~=qS3@0hiKE=|$-|=_wHFC?K8F0rN}gJ#vzv z{=X`{Y@OK2R!pg6+?C#y-WlCIPf4#!FVjZf;x41}@ILZC8j! 
zsr!k{jvI+V)KN4`?%{y1bt)VKZZ@cDfyZ6aTU;WGv(Y9Lm%|ug&-aUO`4y7f%_W3K3qixxt;~Suq?nvK| zzAb%;nHG2aZ2>tA`)KT&?dX|Sp5 zKf>3AuY65j_1MDq%EpnbsqNgbGl9qDe@prU=`W?vSR`=F0rkPQdrMM(@?h`cZ%9wE zV3i@^say5GqeYzv(~y@hNPi)HOM1otP(iQ0@cfp7U?sgH{hie8w;_iQoszYsg-DE^ zDC?G}@MJKRKTyR~!@5P*H63 z=_xzOuqAvwXXwd)Hr{=w!e8X#JG@s-);Oo(^oP>_7s^^E{~OZRr2k9$BKgt`g}Wg3 zekA>!^dF_4k2V8`8;<6!h<*xV2S)hbArKqTGNG!u#htZ{!Z;&|?Ybx@to(vX%#3eL zD2dj*$UP3E9>>#@7Jgg$oTc@L77T5r`_lJN;mdvd^Vibn(SmSBb68-h;l0_H|4r=A zi(DYSF=sf}^hN3alD# zrF+sD@xpB^>0N0mJz%~>n=U?l8RYC9R1gyTv<{J}3Q|^vBXy z7^+*7M`l2}2cWtS_JN2Hbz#|X+8&$!e9u!;({A&mNe2hgQ_>5#ku_7Cl5NPbL^E#$ zD}cS`blRL5cs8-}|LO@XU@2dFS^Bp0Z>7(YP^`>dryqSJRnptihpxg$n65vSUZ-;~ z!vMvU*jJ=O={@P)K#?+ODZMOxQ~IX#Dw{uYqKbCJl~hWot#iaGow5xy>cOZp4xHL2qG zrz1P$VM~7cw)8>Hn1eTDlF}a>*d+fS{j~G6u?A#>I!yXQjWC{!)6H_Ng2o za}E{Zk(5b4^#Ewt2e5_m?A6>7HJnh{idjHcpBWuW=S(u1U~2@h8$-(G_FGqX*wFpC zF?_qicSi+dcik-67S+hX>ZM?SEav{{**bQ8|itX=>_L1r4V;3y&@e;cU_G#0MO`pIW@$2 zXM@?!tf!*q12V(x8ms*Vb0SKVna7cF=LJ0m4W3{EVwp)NWU<-6d&l$7q*tVGOMfIi z4flO%?FFieC|XyCDy`8XrX!eYNQj@34ghtp7Vod6f6F-T9aV~Bqk^Odd{%l% zI+i|3BA80gNne+~z&)eyKVx;5Y@s8Yk<^ZF1;d7q!hxkbzb1V}dR7u~>5PK* zO*(p_N`5k`LjmJAIDi;X@<^&US8Gjh+yOB`gS#!gXl;A(f&$09c>8niSeMAl71_wBl zW>1{mbCqemB)u*@C*8L4Wz8$Bv4_Xf9ppEjM3!cQTYh3`O<^^;Ztfy1z-&^Qdx0!3 zd613u^A0imni{|w7jtamJ04)hX9#j)A#SJtC(;+CZ%Qv3Rc?yfWCru4B41|)Qq}o& z>5rw)V{zxw(~@l2B1F%;2ADR_7o|UyULx^SfAB)W&pwv!vXRTtEnN0XkI4-_kRC&P zxWllY+_K@T=;H{r3l>%+J}Z9WNID@W>oHhjUd@vIbID(=@!ly*nHe9t_wdKk=cI3f zAFl{KYP?cmF;HvcUOEKfThi-rArFCVgK(DHh@KArO)LLZ>5s_{q5~olugK$*13YpE z+(UvJeB8t`v?!Tu4N{Aky!V;FWSf|~SVKgKbUL%io;|+$6j{gqoPhA%M@a?;vAH=0G(qr&jbM%gNr@hoMDMF zEOB_{!j!MK1WErtFMXCwt;O`rZ5m6({d+bb?Z>AHA6s(I$Yyp%K&=_M)PWbr((BS^ zc=;u4u^E8tCF5EXJ^5rb1Bl{CP?v-8m>WB3iACzwU|Mj5Gg9sb2rClW^uJEi%VODm z4(WvGZA-XQ*h3vkH}?a$gnxcsdR@9f3|P~ipZAA|ghV?Na?B9nsd=^)TlK_x&>u?= z{Xp}y@~=sskG6e7(E>l98AIpoYXIQP4G#z zohsN}Jhu}}qi_+wxL`EM=DIV1GlK17u8jMv%^$Tq=?aQ*S47EehFT6#z62Yr0k8$I 
zyY=Uv*Z{AL-^#EG6L@Az17CUuC^p)KbWXmq!Q9S_qjO+Kj}`+T(6EFB6Scg|6sasEbl-$*u>rs_2tC^t(OVapXZ`B?fDQ0D=v7MYqCBg*{@ z>=5P6h;GkuW=nifZRP%94JLjiTvs^2lw3@(q__8u{k$@ZHx1RTI0vV|jYmS3?>pcW zJ>8OSTc!SjNV0;Jd`G%PpZQ1zP!nP_Y{ayXw=fM}B9=E}hxOlD*HEIGO)P!O_5=69 z`mc$U7X*+ZCyDzHy{Ahdz2THu2{py#Em2{3>G=fI%Jp&Z#y+}|; z=a{$Ls0`;;WNpnWsOJDehAT3RP~eJaFmjpP2Gm@=K%~c&jcu*4e$SAC7ceAy0II-0 zN^6*zn44Nk&qO@+B28L>jh`5?F>iUS!@FW9&-+>8)n2=YL0F=& zvYAuigQ`*Gk;O#~<+qu2GhHmL z;d?0j9a9=Nh<9Ox0-5s9i5|K)p98fuKyWptFQerAcLul*T3U+Qp9p9-IZ%E zsACQEw%xM?8P;Ovb?Dd`C03#(6v()2ZsnuW&lD9KC8PE#JKkW&1D{C8tj%t}izdhq zB}L@G@YjSgiQm83kDz6x)5rddWaYh^Dyksd-VxO)B+ksR0vn>9h~BWWVt#;RUi z=Y0bOx#aZ{DJJ3GY10Mz7}SJ`U%$gF1|6v&{WxbX@C-6x>E16#LynAhT=2hJqcO^Y zoRLooyic%CJ8J}TH;@ldyr?7IfrK6QB`1Pz3FR-OkM}1j2E(UeYC+y#>iE$igCah) zom^F+P8mh2U|(k@_8|-&dv0xdHKAOyvz&V8S`cNy5=;7&l}=_>G-xdS?|s7i?lAog zAk)P7XBAoA#bBUnZV6U~2o4CE0#Pd=hMx_l5vNM~!t#nO+kV2f&u{>{QT=Hw(qEcJ zi|xE4yxNei^uox{{Pb1}7P-db=Cl%>qgtK*G^|#nsg>mzt##@SCEr0n0r5o!aA^@l zWutjA)_EE!*v?MV$PLVk|17ZWmGuF%p}HG_(C7iJ4eM^W`O)Ac0au`j0sSkMLFiDj zjU~#w98C0xLPwz70bMRg*fKW#a&V}0fMk_5^2}MxBRfZT$B96zeKQ&g5Zzu4k2j+> z62!FK;ArrcCY24l{E(>s43D=&rB96Q*e!m^6Ru&;&PW;7?5a~N{CEa-E`2Cv(o3XS z6aIW{M?~+?jxL375o}>atO68ahQLb%G^~Tpq-Wr+w`3qWWWFE*?e<5}`X_a2q^^aX zr70~gear7|2J+7g$ebXsXc@RgDKVOYO22JuO(0XE@d7R0V3;G7$$a+@KDoszTl6}| zLxsZkVBnyg#Xei$LxR6l!ZRL5@GbjvYn|i=?BB*tL#~K@M;^XKg#&jNs&;HZbVd>} zY-4VjeX?M+BWRnU;0=51au9&k#?6{r3_7!;5jQxHE!%7a!fnh>L=7~z?&CQDU^8%F z5t>&}&rd*m6K{UPGCj7w!fwr50*D|MCG&c~`d3D+Z2jUc1DLVtOa8s4tQZxn@GM(@ zf!2x~A=oZgP{{%L`^~;$J|x8&kDy}@=g?t7kQu&ZYVlo97Yg&i ztyN%4RPdZD*MlOq2z<7N7Ho++XQpP&VC4r7Ul7(*(qmHIl)ju8oHyZIx1Rhb1Htk* z2hy_?Bs1WUxvNm*t}9ZeskOnLQk*Z*;&tcA@&ndhGlhrLK)1 za13Rrr+Gnv+hO_}4p-T5F5Z&o8eyVG0GCoLTE{W%f5` zkClG^nkiNnyi1sq8OG<*%0sogp3VWJjdsJ1HBEbX$&+lT%w)KtwQm5Co~2Um<9_eJ zmMKCP1QER=gyel=_Z!&&xn)SXRefhxy*Nc$oegQsrtJ(Q3LAXR z_G+2V(+dIu#VU<77C+kXju|U75(EXsEVnxN%owW;M!y(@@0pN_Q33t7&{}FNnY-pZ{%E`S$)6z zgzHF0KEVWuaWl3aI)()7-cohM~2CmuMp~F$Z%xnf(4+4q>KcvtSuh$TV7gm@V 
zuj92eZQew&O8|$BP3915+#_?wTf}~*#`gY0+2An=+;)dZTD?x zL>5sX48lplUQ9s7ZLq{Hj?Ii0v6Jpe?@PJ#5;V>V16mlC(et5!;%)2{znsZtFR$O?JZ6f)r*X+z-th&ZgE4Q?_>{ts{f!f2f{R&Zxz zzIq=yh{veOE{XBdT9L%G$F`v8`y8b0o_f}(Sq(6IwWs9DL# zX+~^N7^kF(TfVoU>bYR|Upo^Z;@F%U&cH%aYW>L0x$C8nl_g6Vbj^r2Hy9T0?iqm~ z62s*N258CnkU2Im+(1&0uHP>wb`6Vni48t6QAGiwBco;{%nPfv$Rf}S>9LJtxt0PH zAln2NzoF{p4_G7V%9D^_H^`Fgl*$}3QZF}2cNa(Qc8slFgJLGY?}j9+=?xlXdasbP zLzc!w4Q*Xn?lOeX3LEh^Mdnc}i)wbwGH(YQ-h#IZ%DHFYg)vo%(K0}w$Sc0OgYLM7 zIhKCz7W3+V;v%S+GI(z6O~-X5)lA zS49CUp7QiUMs%(^c=t`g|DT>zOqwV z#2(5#l*(|ex0n`mfEXK^%{P_;9u8hT(EEsbHdMe0>gp3qbH-_84W+c`e>t&{b`ksU zc+Jaj0}*4-a2+DpDJ|hkx%f@Q*gb!4+>G2v%u7Ji6~2GcpO81Q0Up!!Fk!`$VjuFI zAgE-62(%)#iWZ)TeJ98=5s5w-O!d#rxdz*12CMJFiuf5%Gy*}iIM+)a;g%i9v9g@5 zHegoq2sv2P90F+6ODpn|6YkNoUXCyXk9uYpYP`toy5!xLAdu7&8(drYwdUoAbvRl7 zQxK(RxTJ>kZ$dE9lm7(5N&4;NE-p=NR&2P%bs0=U#UPhX$RkU0S`7vH+!*s&*zkZ< zmeSKuwWdbEY}qNf4cC-lsCoCXc=Fh-nWa?+`$4!+l8sR!0}szJS`B{wVi3lq#W<&D z@EAs0j%}}PvP?e6L)>)+5O+kOU}9y6asHLB;?w~1H9B7qPgmHxqk$F3NuNUyL4&O< zxtvKW>ArN&XDMZ@(9S3`JK~|joMCHGK?k073beCYc*(9j#vyI2Np)%w!_`f_Dj#9N z78;?9B0Wr@GOF0QNivFv1*t#+<{$C4BO5sEmUD4}2oXvz_)uSX!^=b#ERI88qvYgc~Sh^h1H2e3jfIW^3!U)(SY@-0_ zMljqtF!q53UNb{zCv2eOK@iojeo<~o3%%g&0^>TAo`T)e86ZL2Z~Ai(GK^Wqrh7;c zZuqG!YK&klTIR#NHMUqi@4GUpMujJi`xo?SMt5HoZ=OK13$qvv8!g^E#2ybg@+pD3 zCKik=v7vc?ynoMzMQcmcE9V5Ek?FJ^bq_w7VObe*c#ZWF;X#xSMo zuOl{JkHuFJ{nVh1YlDu1!yH+5G-7q3)0%$~9{Um@#UpIZ6AnKUBVD#;u7y#V^|QQg z3&wjEz7}5U?y?8U*^O?2DEr|y#hl1sNW`efu#b$)xrMrT z0Xh+FBHi<^2(Vs&ed@m(tk4oEj%?uAu?7CEMfFP5V}==YiFDUz29##DNOMEJY6g}q zedRN1e>>@UkegWch6rSehg$A$fT(m9_7qb>q0G8QD&}MMfV|rNyR?Gx%noEZvTn)R zgWw&dmzrxD%PE5Pl%XBt4j>G0;FnwDzXZFhLM|gfWXnA(%=OOfc5O>D8|2;<*jhOH zDdeqlf}xyxz;qu%l)>Fc7^9%uCL+(%ROnhPFPh+aa&X9%UWs_~`8q1~sT^|&Gu zIVB{@@cvKYu;!SGLu!pHwnIcUfdW@f4{aIklIjLQC%Z6Mp|BuY?2fJgRa%SroS)eF zK4($=Pc-@#Yo@N0AmR}l1=89YO{~*`hMiMNci6l&PAbE_of$de>iKg#b3~an!)^|l zPZ5B?h36inKYAdB75ibs(@)G5W{^a?AaVjulkrN(Cm)8(x7N!LH7ge)f z`Dq~fa9|5+&vPq_dMMr6_?S$DcF!;&w?yJ=FqbLpq@#X(II;%=T%20?6-D{dc;MOK 
z`E%^!xuG>BtFVURf5htdu#C*4-JqqBxU|v(>F3f^dTGCo9v2@mk*2UhM}$HtAv?Kl zoNog*7C`?NXV~(n>9o&0a z;CPV2!i@m`f{v3p3U~}|ytO?Ol!LNAl1&2xloLJ|Sm^*avpjUa2x4UI^ChPl7saSkv$3k&&c!k~_Y zRj@aNgvAb@vF9^z@tK*KE5U4}2a>KG5l%!k0SnGmHRb&t+HsP(owu~>p1TjV&RuhK??%0BJccrL|8HL9N zwCBj$4QqiK^=be~b)xj(^pFS^YYRok0o<&gS~XTRE3mkkwe}2YtYUdG643)*H~_aA z;q@pi$ss-~y~5cuk+QS|7nkh*a&#D9SkDRiS)+Y(_>|$*4hC%GC_^;lTRB+dgv-QB zMW;=de(t~@;;nW#%@$~&#EoyS8$}Rs`MI5^x3N^zi@1{knJ%LCIbw^!AwiyL31)V_ zWMM!T5QYk)?xkNm15sSSHXdn2iqxdSzMO+LTo{>f$+*s57?WGeM?;@d&`7rs!3wLU z_el5)gFT||b&OTt;#ih8a%lJoEM`Nj6GCc@2$)%xJS+p{AV=pE151k-gp(R{@uPqzMKQkb%C9+*e zH`$j_md~x8sJL?{hKxTdth0M+1NbAL&IVa{MKjt7X-mm_*Lcm6t-CNR$#26%_eUp{U&H8N}$q%R|x88?^^_}L{$dM1a@q>aAQP+kA z%&?m^9&U+BG@}tvg~iYDVFfx6E!DEaX*wN?Ha7GwNBiO zpT5L~EcO9dSDuU*D>zs2<{5jlBBu(rZAUNJ8Gk409)VeiV?}y2ww7l;XB22*n=3HG z%(^MYi9k^bzXVGW<*EhabOKb|Is}nEHKw<=Weay77nEGqy8O4uDU4$PX%Ud-RwA!P zpMP$=>r?5Tw1swdKtd8;y(h&21JqD&otioprhKze?;Wu14s*3YRI<8Hz`^5Xkio&tlWau(un_!)s(|>Z3cN%jzQj9eX%k*73RPfmL-gspvJmE4k$Y0 zmc*`%*xIW>*4#WLWHs-=iOd;KXll*}EZ!-W;<5$vaU7)CI z<6^|}YzP;l5u?MPgWotY+QTKDWYM3L)P+ysdn2)K?S?b6UICts)B!z$URvQe0+$jn zH|WZxcM{e(;FV3?6iGx2c(Vg$8iGeV>ArLUVJlucLXiy)?37y$(1%XP=f*z6cneTH+|_g(2!@>g%P3Z1=&yvs}8KDm+C1O zF2x4sQi+vn4EB2{Rh(Afk9k!?+LH2CtX3F<9ro-6OvmHVfwHlSa_gQxmhMTii8$%$ z&S(Xx2d^FdB2i)r7EHYP#4x7r&QmkMHFBl%S`yn;Vh_bDt> zo0;-dRzukGbh~{NGysaing;`m4#(Wu&4<;WsH{v0I>S-zFr;f}oEP3T*b=cOGWp7m zWxrrMxfR_E?%x6r-LV?4SD??NGajKH;D6mStfhz8kODkzf(M@Bz9uBt-5SImoY-j= z5h)jDgDO~gy*gsU%496Sgp)yTQTtm0!B#@bC5EYY_;C?^;llBvmVrd zt+stotNToBg(fDKX+QTXJooTDw6*G7OT1S=S1kzh@OsH(gW!O>A{nmFt%h+2N03UekdA=S~c*&je*vMk;ovwq6`+-G=^mzvF$b}buW^NEu48|gfR~pPc4}1m0w#n zT*+Pv{!-vC1vYB@)7Uivu!Q#4U*=R&doexT{^nft! 
zl7S>sI|xmr`5Tfz3Isp72u?Mx!7yR7+v!O-yq4X5W zv*FF7UKycJAX+2)_Z&J`-Y@rUY&dU?c6F3m_dzw!EN&ZgK&Lg#{o;g~J>%2-&yTPAySyYC_X_=^- zq>^a$C)6Hm`>W}G2GcN0`4j0S?iR$jOB~w`Muyx28|?PrFB!fQe6Vo3OBA30n#c!t z{)pjhr*<%A&SgQYE39QH8wjsLFjLz%Ia@c!jO>7!dKh$+Tw`89ODYuYR5=-aua_?4 z4WoS8SQ#V*AQwhIe=ObKCz2!)ejvTc*(HS;^UMWdO+meVH}kb4zqQ;ei_rL(y0Bh5Mk&&YugE zJfNMlZB`iB^FcFqMb;81M&OjWmD?%;-ysDlMEI`s0q3Gsv_h=e?Q4$Zo*CqH^-7o0 zucVKqm+cUn_-ATq;Sfbj#IQG{H*nfhba7^*1GlhRLkidI&Jx=-0i9yigM93Bz@6=lFTS!#}yCd3a_8h=UdRO|MbS!-yoTj!po>Rln(>A({ zAo5iDiS*yV@DA};!PgZJ(+tM>rmgq3^c}YG5g6iv@Gb1&T6*B7&p)wCE9nvAwStcp zRi2EpNY4gDu{r>ore8`Fza9;8c|;J64T)XCxS9}$PEgk%G7KZ&*Z^-zKag%RP;){Y zaX}~Hlw~|+0(G}|?@2$FUL*fIu)dguRS-9bpzC9}l-`$qB0YzTp0bmVZ4d1%b{7S${={A)6HD}@+FrX>5=blQZ(TSQaena|>bc;j5XGZT7!atWTDba5I;`gKcXn2K|;u23rC0Tm3_HogI6jG z;9BDs1pQ++8UUS2KbHPhnn*98vnvWKCsbu})5WVToMMS6a zZCw0w&P0Bi7%)sw2?TvX9^d29yZjyJ3G!gOC-dAw;;s%I*?G*kNU7Womt315r!M~) z+7c00lztj|MuU(rao4FqBKo29ckHSqy0!rZYY2|-OTU!fA>zHd_?OcE!ybMOHCov* z#mTLHB>h5q2fF*!_wGtRl1`)}=_NAkIr+gEa@$ECNbe73IfwP11I}K8d}P#nO5?I* zs6h_~PrtXIlsPeDxPvzr1cxpNjof3)Ar7_eo(79w+8T@ipf3MX`V;9zBE5}tMn+d5 z)Q33WPK2LG-5Cq2s^iWf{f?pM;!rFVVt z;^S2(5s;L0=PXI^V?=}8hKHAOBsDYKUj1ViW&<95D*r#A(wgCMSKMF*oOqKH8b;** zvGfD!hIC7+rQ49tN^`morE}?L(l1>9chbkiFl*@x((~xw%p!zS>0{{)>E|?QUS0ev z={wRLqK9A%Kei#fDQ(?Rlqba(VXr`!y9(1`-@+OjDsEBf{z z%mWJxOKajO(fqmPS3MR|H2TWQ9XlG~Hk`^D(CFF`lwl(EHYkV4U5^bQOp3)7pM0o#FOj(ieO{g#wV7t$-z zbCkkYBz4I=o=Lxu-rAS{o6>)jE~QrqS)*Go$S3!u??~^s14#0JhZXva^g8)QnDhip ze=Pl2`mt|4fBu>hM@|HlLtsk~$jXWoJsmC$?Gc75Xgf24pmQR~UN?TgxD)9HfnR7| z&`F{rXK@`7D}~~JFKwjHU`A^0wxx;XSJE$}AMdO1+fpOllirg)V^eRpaJ249zm(pU zeg-JjTfAibzCb^D-7glVF0Ox+zQ>`x!{-kOAvg{dM3NXyR7mHX9NIbHsMJO^xU@QW zYxLq8>(#A)V^M#ke6ljujvW=P1S2s}a!YRj^?io)HDyyDQN}uxD(Ow>7X$rINr0q# z(rZxA3aTPOp#50-p7iG6;dX=wp}0g+~oPs-7xj5zAx}0S)!KwvAK>!EP$Y&OAlzt zU0O|j#n)ES1L+g+=lp??`9TPguW5o@?`7pGbEH zcNThb0S7U&^$RNhfi;5<>&Q6}P_RH`oa!ha&i46FtszW-8P<582wbB8)1mKAkpF!M z$TRduq~PaN3?J=_Frj!4rJpiTBTVWJ(IoKb(f;C{NX$^M`N?`0k4?O4l 
zFnq9u94o8v*OAMuiJf~MxUr2sXQRYBXB6>79q`(2{sv0dNBv4;WAA*e{A+NEDBSIv zyD;V>i?@<~C4Dm3-z-5xOt~9<_*kI?haK4s6IOeJkl<>5f_K8q6e!sk?q_)%d4@D)uY3n18i3K59Lx!k1vM0}(d1dB!iUnY8+r zbsmMU3oRJmJVB(2qSeYqT%OojUNt1qVdz(p<4urg_=AXqW){qKqN$0})<9KJI$UFz zW*|CIF1}X(6S)6AcgXGiLksJH8S{(N;^nqyV+lQj87~99N3xOrIybWycdqvD91!#z z?{R^}4p!`MZ zzl<#4_L__R%4=7Dw@;A$^)F6xE9`*eiTRx%n&&i}=B~r7g$!E&mvc!(rm4K}5d0&L zB1}{tq7@;=>K;s=a*!1S*c;Z^rs&9nV2lwVNgZZN;5rEYVCKMi8*WZ>Icx`jTHHqz5TP(hqXV;OQ>!e%hcu^}3X zW`{Yf-V7ivWI4I62H;ZIJaL7dB#0L7$#1N}e(`U!cyU2=4HZ^^IcB87l(a^%ky}b9 z58;poX*9PDvo$7a7r=#2laXyozL$*LDYH(sAN`i2FY_ty`l+Axx4!t(UCec##&Cii zN$%3R8Du8=gTITNt@6J^ZxK1!%3Ksrl5``CUc$ zzvAKyn)!hdNur>54njF$?rWCyRO%*LouIT%A@DZ;{ro^-%aCLO_H4gTDkGEQk-r5%MXGtN<9 z76GTWmQ&^U^X+v!h<_RR{|o5aZqtzj3#u7RV=0W}R$1XQ!pt-2l;ry1pn&#!^QlRV0`1%4iPP4I z;T|)_ZR8>Ug>(znNrl+A7IBu=<~bcq@@uUXBT$bi53;jjVj|@&oQC^*;s9jetY}G{ z^x4EwpYg1o7|xv=U+C)QEva3gcrgvK;NrGF4dnN56a0he(l z9gr610|zjJIC4riO9q%<7{D{@PqXdPhxpTIvMZpal(t|iT<;TW0RKuqxv`w4l(};A z##R4x^V)l1D%wSFV_%#S8nM&Dnqj;AdBZWoTZXo+u{g;qL>#)tD*pb6J$-`Q87)w? z%?M8Fx@278hD$mJFt?`Z!YaTOoaz!_f4g7A_AZ(Vw#Qi3# zGG2cV7w;AEW;BS(jQAe^>EElLFbpj-?UgYfGDI3IfDsnN6s+rlHQ4cUrBx=Q)wH(& z=~g>t?;Y4>>DwEtPxfqhJkkv7e@Zt(DncF^3I7Zl>ZjBIV)lVQo!_s1!cRm8{AjXK z_ZdaQPi{ek-?=qaYewZCtttNpxK_N{LQc3JlWnqGPS?<&aox8&|oK+p(Q50 zz(a2ahuX=SI^gHInXbWfPGEOD@^0$E#kZu}XvxA3sujiV`_jYx zp|&{~P3Fg?d2JOwCod2dBcT>7eM@Z9ekV>qw`-6d=e(`PT8LFw5{CR^uJ?5YF zz6ci__;q4!(53C03+cXeZ?q96{6iV{v8>Wd}@{~XeM19 zJ;0~dj#3UvYG6J)3eStdER<3CpBQ_wwP^-3Na**ZNBi=RR3>?u99F2v&J(sCO-1eK zoez2ExgD;vM*km-u6)Zx?K2wgO4zSy^PCv|SPbS6h#LEh7_~9Fas!Im8vQ}OdD2#U zD4p8~?WpoB48!(eQR6-!XNb!Uwqs^hxr`OPCw;Q7g}EKg79%HQ(8!ux{q)KDe}kZ} zF?(K<>yKEz(84v<^k|`77zaGnfj`CpbS$i}us*{rV_(fErEkr*bPg~Ijaz!9FAWIX z@~XMzciqM1Kn8Q^Md=Tu*QGlwZ-lsqlBAENpGp59z2}KwiZou4{*Cli28!NbI6{V! 
zejt5M`mXfh{>7h>z9GHH(dY%{F156tO82C9r0)*yEG*-Nq5o^MTU$VZ7Fz6TKm1&f zC$elam8|N&2Sr$I=VZR=RI-a4KlOC%qwkf6z#i_T9_USEbKOFQAwwlx4Tf@%S+- zKhg`)u%6MLH(|#`9`T6g{B~eKTDw~S*lQ~q#B8i=beQ|T z{J$o>&Pw=g(BAIc}x&kTM`vW^3}q$Ca~I$bPO}PfyuiW ztY2;?q%}54E_x9|f0{x0b6@@o^#2ReOW;yl%e^Y;J|cX_r4YvJYtn1d4I3>Lk<2~m zL+O3#htda-w688s`u{cQOVX=S$?0U79nzWFWdDA!cp?ABGOgMeC_COf;^}Do>JfS3 zB{x>RYAlk;Z7gWzVw+;W0^o>ly=Y%(q+_PM^g`dE0KX;umGl+qL|QSOHOEM{(wX!u zPPluAsq~!mE$QD&UnUe=uz)dF=&n>spD=9Wibyk+UXcD=`WxvpnJ}=Ga z$a-EnqxE?yy(XQJ1oVQ?&S)d0`Ry1bQJb9(%2khtzA~rX_1o8LqPD0Y?pY`q-xs0L zh9Ky^4Fzv`id*roB!w5HuS)-`^aU7uF$yHjDO^jhK@PYWkbz&8eqZ{g^d;#fsgpjI z#F<7Nogw$7kEMqL`M)ZCL;5r6O#&@RM_ioX#5Aa*=FYtsW=5be0m5970q!X7Hbzd^ zc^tT-|FCB2g={qYme!^YBfU?e0=zrnM0r8}+L^$k%_;0*H!A;^r9YJZXX#69n3mzs z(Wk$X?nxig&)dno?>D4>FTFwSw6m7r3=#eeuf7fWC(HMB>CdFMq!%rHtI+85+dl%t z7|Oq-P%_0L#6RZ{(9{Po>|M9>7$+#-RnX`9#`EZ%W@AlrpmZVY4xZH0PIQ7@d|;RlRv`wVgCF z2ftY|NLOT=YuYPU4xsMPl^~>x$)FV$ub@bAM2mKFuhIj8|)H>!pc@ zKbC$VU4{tnNMDxzMEbIHpDh|x_>xdQr61puz8k~L(#2ntzA62_^gKCE^ijkzhK#D`} zD+2T{NWUk2L3)|(Is-H>dFBVw8`7twpM?C=;{Ad27t$A)>Q)iKh9Y#*Q+k%?(z&?F zJ?SgbH>EFfG`ZOHF;cCR?n|ZgeTGI14agBqzcX8#oESjT9qZ5zsqcwwsTb)1?^&@g zcE(~!Zt%>_NNxujhsWPwGz&t`D=qqbl7ltTB1?SR3n9N-)qpaLjfaLAH z?JYpYLrw_U3X9!$n2PNo5hNPyH#RvYI6sY5hBkv~6dAkEr99_=d4c@YW~9migdwcN zo6^^%FH3XeH^aiUT$4=l1@Y-#gr6dFet^+!NPf?ZrJX>TdvRz=TD;e!FOz%a9L~D7 z!fkN(mt33=9B^(wTA&`$7BK||-WbT-3*u+i!`-qdQF4zWOWm(0>=CZC-H&iL$CN>K zeg4JrFn4tQ|2jS}BLF!vOSHpUoMWf=BK)lMXVTZDJLd7{JZJRrZ@^`G6sWX#Um$eq z*vSnw!2&Gx#1}6<2a^n6=Rn^`A#&qibZBxn;1MUY{zd8cq}S~z z%c;H32ID5g z*6N@wiQ^V-^wJK$ys=(WMn)Xr=wupRQ9vpz39eoKTQ=#&<{i|Cs$(elt60M9Q2t2^ zQ|Ynd9G#lS4lZGOw4M*mrMuD>r7uX&6GQH3Jc%Zt3CfmR$Iy*#zb?H_64Q{Q)SN(b z2Y)VOF^>4Kl32Hk_?9rHAr;C=j(Q^SAu>Ugii=OME(t-oH)!UKmG3BHW_F63BT|)_ zseTbSEx?P+AKqCTYhh=S%1o^f9hAgl?fPdHXKFoMFhZzkQ7P9FYxUN_+4whak6P-nF^0#!kgtIh`nzj-;2R7p(zs4VhAw90^ZwIViv!*&(DQ zRFTfwb7wZ$VjMzgY>`Tq?FnyQ0f9wk(4!#+-~WvLy+yBL-(B-XQJ?H$17t>yi!pvX 
zV8hIi&~f}NgM4*_oGbg)*ph1!jb0;8j&TbbS@|>>o)wB_~{sPErgO))gDGA>ZAx!JJ;*jj|C6i(A)r5a0wulF*&| zg8(#VtGAe{NS2}xAqw)1wGx;E5Dw~;mcSkHLez0}%Bf%#)DH0G}hKx2LB~+jU_s zzVPr>R=A8hXv^e{3Fc{vN}Z2JpsuJbqm>H|8WuuHGCqgzRL1j9nBr1d1us|}4Sz2P zPOsdC6c*`bmJnrV&o!nyI11wqxC5#yarZ51xMfU=Y&7xy$|*&}BNTl>?oyH&TuB`P zWgv1lIKvj>++z1@Is)g!Fs-?|kqUg=YGucd$k4$F=<^i^>g-iGw-pNVQuK3fh{l77 zx$h&%h|6l$Jd*T^XjVK(p!FjJ)CtMPn%Z>`P}66I{@2#5bfbBll^dQ)OURTRJdj8Z zG6RKf?p*ZGpgiWga&yTGe(i+e215umCpvy&b1VV}2ng+|F}X8;{vcZKc*DxpJkZ(o zppPWt#<@i~p;wI!HP85YKFVT>F#D8zdCi|=3|u^B?kep2r?GnX1&~9{YGrJvUMoPx z&DEda99m zZ+J^XM1wsZF+j3f$$6i`w6jjW8d;QsX$t-o3Aa45#?Y3fo`AwsgUcosa=Kbpxbebr z$=zWQFJSzbhZp51x6Qd(>wPsU!{P@)yaEex8eUFdl9R0Om*84@529wq>(nHBvMT?r zWq`t9m(xC}D00^pAO4Z_#7}h$aR0)>h=Nog!G=@3(xr1m!lqwICrGhoKS$fw65)+| zU-8;gb3;*^>Z~47)2kDuhW!@KtX3o(_|B4r8Gh=*U!kyRte5a!TWh>=L}R4<8&;q* zhS!?tvtkuQB)PVp*FB4AiR&*cU+Ro$;C6e6!nU^OB7<)*mlv#GVWTuhP^^lcx9I-u zxZB2(>skNA@|%lex2J1X;8a_3m(rwI4`Rp`dm6w?%|cGBB)Byk;O5iwBeGmP0)!TS zSz(^mhX zca0s?LqTfeThHw5loSUpthjVU9K1K(3mcmpG|Y(&Zd^%Exj}gwjWt7)r)I{kq^H2B z;xX&N^`-f$9Tci5RzCgq32$2%-{j``kIAtn1e-w3_SMF z&Q9vdfPIfq#Fd4C5eDt+*y+!u^ebscuk{oMxC5hDOOJ0GhFHNR3f^lx=Ms%Ll3oN+ z8{#Js#uUW6p)E2#r!(gFk_)YjSVF zwz23_(&M1H2m3L@Cr_+RWBilR{drW zOR5|Zn$L~Y6LEBpgg*K^P!jAz3a2wxdrLNxp7t8i9(gHu zhnG5nKs~pvgDpTxM;O(Mo?;2-R03z_TpFIa0?3)7&H<SJ2so}aXgBfzjTUiR zqX&lfj)4|YLCyRygn|C;eOijhZ93e*xwIjgn-kD)1{hpv2U&MS z-4W>r9j5>TztsUGZzN72o_3XwZ)=s?5N4zVff?(+qqGVA|URF zhFdG*jX)5ES zMf4WTn-+VV^~1`&hIZ6`TQb-{ajzIv)_5e-vGSRX{3vL?3+!NyVSD`S6a9jJRGX35 z+QaA8BC>yk4d7EyhlMUi=`K=C3ksD=`=FvlKW?eXJYq4xt-0cjGfdHdtO1p4Zyf3 z!_o##mjonFtp<>G;D+fWXf4Jv8jsIlSWDR73nxn`nC0iUcwoDKL)KfV&VtsusK-1(1g$%;tqm^d zaHA{k7SyZ&gNJ!!VN?l6o2J%4+z^D0AkkrBf;>}NC|3}n%%Fz!P^b!NCz+j?#Z7Ox z=7J4xF?@s>N7*)3+1{B0jvCNrG=dGB@yx=I8Xr{Qn9lH|_hT1#)@QjOT#NKshU8Zi zYWL3{6`P7JG_h-T6dl6)CyQz#i9*phMkd}J1h7=ApDhUnIXT>i>47ruwAC!Mx|B_+!K7x)N8PUhkcoiJE zy(S?(=H>x}pJO?Sf%j1CL0QfiTRcL+6GKupc!AR9{iRQGfA{dIkD-BWuqt`%o*d-k z6yG1HqO2N|9%s>?l)C>kAg1w|QZkICD{MeVuamB{%v 
zes)jSSz^7$i_mLSy3 zi@0OylbG{&F|dJ++4^{~@T3i07H6i~O<>Rk*utebfP!*U#{I)=%nk#{&H#&ZNjQxa zys}h}`WP#+9rV&oxz&;+?hXu&47;~Q`G!o^uxcx8%ZB%lv@l}3;2;k_fy8e(hv&+; z)f;I}0dFTg@)$1Qmlc6>3z2L?Bqvapp!xK+y$G}-IL)jo+A%{j;~snO(aQizW@35@ zmGko*;%%&~bn__;XXq?d4!T*Gn+d&rh2QqLKSA*tp&t{6tr<(9(Q&9DlgK8vh9zz3 z9xPD6JA-ca^pMF!O)d9`%4CPC>_FRlB=$97s<wah^bSk1h720HXQbWX z$(I8jL&ElQ!)(M}5Zg(Z&R!CgSsQtHk{!h6)&`8v2I%`h{*m3L`#-Yb05cXcg45L? z2C9jXXM9mOy$k6hR$im&UxT^3V_osBVe^4;EJo!oVt`P1C*I2(a}Ibi3_z#mu_J=d z`WsQUQ>sXRIt$!e*0vy+xgrnG5L-SP0JRAAX}=+51Lk{|GMLO#D)eWC3Ca2}#g5&c zktMGOC{|z(A%&>*%`Bv^fPk-j3Cn>DXXKA-%b&xl26)kW5F8lXit<;98A$h9*otRH zP`UZ<+|tX4-8NXM;OtybxXKPApspXpw2MEqCsNfz#oX~U{ zgq^Jo4#;hML}#x&e1!rRJ0po%+lYgPrQJZy8i9dp+o+v+!fQ+CheWne-VIM4+03!U zr!{8qRurXX&1Pg^kEL@dlOABVTDw!{5Z0OWfUu|~7zst15c_3-LbMU0$1Q?Q)_OzW};B(`zQTvW<|eJ zl15oTD`@#GPZfN!86kSl2808sjV&TvW}q+=GJg&5lL*j)`z#8|E%nYYCPVzm4N$=Y zw+y5#OjZq+Oa^1_7a-ZdQ=6OSMr#%}AOr>=Sr1gA=CoQ@2I1fD%w&)hx zu^)>5VmijK-vAxIG$R-8P|O)cyj*%snoHkv5tcSaEVuIzntmH&N~Io3ACFGsh%_f^ zU!4i4G32SVmYaPkYHkBH3Wj#2^5yfKDkucp4z^CR566aHCq!aypwN_cX>W62L2gVk zKPP4BhzajlH^M#)$$;%<%ue0*Tf4%ORamHFk3hGa6m&)ZZTwUKJJOn<9vxa1y)Xsv1Wj`gA?UAmM$K=Wrd*Lma> zQY7^Rai(87CiV>-+Y*~@eH>JW#k}BAqks`bqXm1Ubtz^P&X25jWMOdq9B10$iAFk^ zd&J=SmtPw3{{?biKI25N z7Azz9S4VwDLBxL15p?qi6WVC!MC!G5gw5GE3jmJH4QRnpJa^bJ*K{kDnCl2DW~18Q z(MoYefD}wz5p_joBhMNM?`SvUS#&q0EM&q$gjFhVPkK z$&lX`688%8)}i^mN>0Xmo}kVFfwaVN6KmwtAOH$TW@Eb2aJG~9V-Re`3&(Pu0x~8fK|AMpgSgP@rQAdyNlD)t?b`?Vqeh`3Kk^nCv1<>gW!&Tt_;FyI9@S;l!!)7OqqMyy9S}FY>LX%d}T0d zg3GnHfdP92?6g7c!k;8vc*@Bz`^9Ssq90=M1FlYjtgMJ18+arFm2qxlCt6LZAysy9 zIf$Tx=)AR+KPEgV44Zvy{!Fod=5~HvL96ACoTg)S%R>x2oM zf|VTSbq@^-7(Li+b8CgUb`=VqO_bguO^9qZZ0I>y$B6!H!MqzV%QLgs8{&!1Y}9@W z?C1xd0VT|Mh463{rKXEP5I@86MLn{@*F>zhCh_SN084*?h}WWyduoPqje2hnXA%|` zBeq5t4Q6|X=t@hrZ=@eKY~FX@l?98LxuIb*+rW}dtPyL2#c#>q*MlBl*S=F!q$F5K z0r@e%Q&@1mfBu3PB^+op@~pVNBCnqudT?0;cg8sFvfmaE^vaS@rt&N>YOG;gmv$Ca zgP%(7EoN18zWZx7Kua}m$(^GSMpo7x3cn^^U)a2gC@kdm3fsXvzbJxjScpb?BDKu4 
z6c2JOJ+U!O@+F8Sp%&h5P-l<%r`)2!5fm~0q~$%vhP9@AYY^U3j{yoh3giOGpJ22~ z+gD|u;=Zq*IK3($?iQE0ppCS+&H5)VF=KL7MP?VqB$ydjHqvab0z1|w=WUl(m5MxM zi-+B7f$!%Moc5a7b-@{qXSmeaVJCConNhBa4uq(9wM^)%=@Z_68q2E!r}D_SVUIP2 zBJDrHgkBh9IhtSByz?9%Is(C!XzrG;o3Qh9#4dOYCC{~2zMM^ZMy;g+IBB>z3jJ&E z1da?Q3GdWE^?-6})_kOi;N}4)9LTx_`U>!*hNWBDxe>KLeMu8{0F8}Bs*MeYnDl2Z zNpuT6FFnshk*&27)ZD)@=QfT)v{=8&FwE#*5$WuS(d%xmY>X}$rZ)`bjDLzPS$hQC z!p}U#fL`=lz=C=ham)ydqwJp`k+Y12h2k5WQ8@@7Qjh%;={d046{3#48peDx@ZMpA z*F@73iY}3P%c$thBWMtNV~O3V2R^**w?}nc6Zt|(R5Bg3(1_oj(P~y( z!h6+!&xYx~^U(;jHhjP1o#VSY)97I%VQYOTQ*%j~-8u5GI|Bn)T4!sGY$DKUftEU2 zeXmAqNP(;`>}1{`(q;T-vZ^Pz#QplU1OVZKW;}TWc56c1ky2_npw`+vDrnG`wrBED z@0&we*pTeh#*Run_7j%5bWQ2Z?ykwhPY5ZGBv}*7#$I7Bx!jn^wZYa?_R@kCJv#JW z!hYVNbhTk2E9=LevNf8~7&gjy(XWz|GCh?Zf(Sc1vBzQx6utz!>&#|Oh-Z4(jognr z1Tw`?<*>om7?u=-eH_Pmxd2YAWO8jID=TurIpq8vgi=xAKQ<4Z9{mwF3B&j-6eT3l z@lfLW!ivj*lWtx6a=csz7?|4X&+tmeeR#j8`a82}8@iu!LeBV$2I#%ZFT?tFMyuZ7 zk~2`r8fVl)B7IFG_gEMgC^fmvG`=GB5ZENK7f%*c)SfE?Gc#%XLV@DFEx#?rKdz39cE zmN$>0$0O+>Vq05ZXA?IUKz}MU6pY3XenJrZMxr-iWG4K*GQLWQBIVp*?g40Ks!~}G ze8Da(fNd&(ioh!NCS_@p_=2+)@x+X6cVUWrV_5eqnHwHi;XCF?Xh8&%Qz$9J2}O*a zUV6)REr~RORa&xiy(;zv+ik+x6CD?1+%TvmBl(xBL1PW!jRng!+FDo^zOMsCyXX?n zkb`*d7-bp(aj%XQTW>|kJ;mb$q#tF3Y%t7z^pm3oCAXa!xqpX5dobW}f0V$I|6LQ% zSBR!MG+;Gd%3B-R5RLP}8!dRXeDty7D~>Rx$L6#e7C3r7H(orAf)Th!^djC*cf9?xmrIH}V&eOv73vv@U#Aq{300-FuG`n>6j8q%mXt9se9D0;i{aDm|ig zePvKVXAZ>Uyux`Hg36y;*7O0>R37xlBkmLJj-9`;IC^dPM-;?6JpQ=+Tk^=vP_W#} zASW!x&KQn2AVSm%r-VqCq;f4=@Gfc=&ZG+?mlA|m@O6=jZ=f86KOd(6bwoc;EWzt2 z*3PlMg|%4q++fteR@hwE%V@98`*`*K=E8x@Ex6uVhrN(|)L_Pnz+njrV)|<=xy+Te z))BQgKpihoS-iNj)O=wa%ANsiX!VM~U2K$z)yF27kr4%zcQ2E7L1lC^VEmghwxhq%}P!W z%q?;+U}e8fL*%Xvbc1sp<6zg9;neE@N-CBs5FfV@Hap zkvQ%!_cO@($JU=PLHT+aScB2t5dNL8@@sB5LE(D*mCj6GZa0}(FcC%QWTB4_1BfUC zT_M#KMc^>~0YvR=wqs9(S0G1QE9kepQ3~vyNKgDUoK$6d$c)z-6Yv7ibEK>bs35D` zW?==xDJoHw`K^p)Dy4gaN=nqBYu3EM-9{1n4pGhIP=2K~kz^2YayT8qEp7%~RIdNm z(g#GJGhnx1HY|9l4Q%OsD^3s}o@g7PS7>3N^7B_fd 
z)Oyvcy#r_%h48YG?$oD&b=-aJlE@6CxC4X`C{Rc8GBaGH7dusUPEd!Ga@!J>ZKfa? z4J%MjrSD0vNoPjIjf}RlnS!b3ch96sYVkwq+tPE=8_4%AwxriV zEZcG(!xU&_pT4;Px{`h&{X}}zRw(+$1J_Ald*%Ogr?*hs<#Ej@<0lzt+8PkIT{ zEjwVYG!(4ng@G@ zzXx~HIj&@Zvsj>8!f^(lt``IrpoGU9+qJZIh(JXnQ@R)&C^G?e+EI|qppu7+YOs6L zLkEy-$j_v|k)G#p*U}CWzb2+h48xyEKZ726<3xWZ{U;9pnn;hOiS)erfYz9tau2+g z-j@DPX(D|?IoeatxKcJTZ%gzB7{dZc2>7(C>7zS`%Z$vDa%OOozOXVkGX z8!11rX1$&P3&eDT{3`&HYg@uK3p8|KNeaJ|z9qdOJ;&}3{2|q}@4+kEc_}k4{u}8f zDt!~$3hofKq&5GT18uh+0lh8#71gnqvBw)~_a%684o5(Ug?4=Njvd8leEKq+dvXODoND=&Un@OTf1^ z6~h}5E~TGH|EF{;{WfLdxiwU*@B|rVeCeC#$I|~MEv2^*cmUO{5o<4`kELI6po2>=OigOy=Y|9!s5c2Oupe-xY2&L!3hzcLgf9GvgZgS$MKAM8g_P3e(?} z{#JTPdLX?>>v#aZ5#gmg>XFNTBfTg6o%E`7PkLadhom0aqzKpSob$ov`GxdX(tYX6 z$UiqUryw)?KzdvHmD``a4G9*))}oFekp}vEWSHANkq8PwYene*FDf&dS>LVim%g>N zo=BH4a!zc8FK!#AFD(0aIDh`jY{TSwo=}r&r4OVZNWX;Ub93=y>DxqlFQWiELY}lA z(nI}{X$YO)`Mbo(uUSt+RKTZ@;r>qgVYnXG{(>AKm!1%l9}&m2NH;TwxmV-B_Z{S! z8a=MG-onVtdh##HIj>0YN^DaE^qt*#ME>W}ccll?k@RVU-AYo}9hA1emwx0@NFw|@ zsghRGZ%LW-lx=(p8~R*&U;4iE_F(an{L@4J4sC;LP7}G53TY+1D}B#h|Dg?vGU?8S zgXXYv9@&XK&EU!w7h9M_x3u=8@Fqas&aaFPhLW3a`C)5RmEhoxWME?Q&p(nlz9l_R z0qxS}$2^rDNxzhSMV;;DyB|q^gNuDxdYK?T>E9(-+=tT7rC+)TYnC+W=>zFpI!AeH z^lv9Ul-`#38(is(p4ZNa`vrSh6?-BlGj@Y-1 zFj|mSL<`{6Z8Y2zssBK_E1gJRu(^HL)SxyX25(F6y8K%L*#Ap9mOf|0kJM9X?ffyBgWXE3>%hieH)8rCw(aWH|eqT9m>=bj(K0gL473s zSbBf36Vt_iEiI)tqz6QOiT65_&P2>J={FjdMMpA|kq;E?VJ_{JcwavI0sbQqg zR(fChFVa?elc;G1rE|;H{XqJ@^iyv}O&j%n>Aw-sA7Onmo;;AFghDGbeEkC^IJOMzuWY6E1|)hsqJWl@$JG34bWoHw z_2}lyYVHsP<_W%b#aeWx_9M!ZD z>L``{OnQ6JNt}%Px1|+F65O++#dg??C(=)(UyRaNMd5sI%~QeUn-WA{0R`+)z@WVZ z)SMppl-ih34p4;H?a@v0Au4=5FdmL(Mj{Jn0}a3y?JM_clW z5!f33HMhWj#xLw2ptMszGOYilByjJbX*juizk8h89dKuaVzsry#C{z4bTAh#r5{=M zV}!z)F-k55)tq#3N>9Is0pEbZ7ktZ_AYgd$hS26GP#VReM;Rs2q^?KnNDaYCs9S|e zDy^k=!n;Jp=SKcBh5ioXJN=)!Rt+|vf{~FtIoI*Cx!bQKCyt9Jh zB{i_&;=RY_2`YrFLbQ@DC^e19|9$D)=ITTw7y0JeX8I04rFDqr$XtCCsO!^-bl6H# z@hrKGh7K}q^f!z&(Ww8n3jp8J#>+Lp+>_h%e}f3W%L$o5YKz-k-lm0H7Vn3aKGdvJ 
zhb>%=8m|;UU7`Q#Z{Zz4;MHK~?K6PfxYxlmX~?-M%gcg9-`fDmXgsm1TWhC21p-?RoH#RMzJIzFzcHhx*FYc)`h3G?8e*6&4qISa-Hc}%f;>J#T(Lu+*ZpsYIob6WqMM=h&r-h zF9*fg8D?C>x7Oy9h25z7UOL0|)&mwtZAJZPs#;mebILd7x0#+pExfR(KeNBM2%n|@ z11}S6(cHFqS|dCS{=~0FE5rCF()Miwpstm;o_D41+~SY zQ2>aoKjrExP{sW_G=%)Y;0~`Vi+XbGN$&+3qRTrxPjK=pJm@uZsHLV z2Gs~(X8mQ%t-b3Gn;Jyl%4UiLrMNctvVDdFb}3}H|H7{&!g#M=OZaiI#xDMZuLv{T zqGv05DmwPUF%*r>p<#_Xmqg7jzt~7MLGbqQ;*DYQr$#&|tuOW5jx+pZ5$?v8;MY~* zf0q9L1}i`81K0tP*BmnE&f?hE0GS0u`%cJ?c>O8RZ3I!V@1iMW zX2-`rD-QY%g!Zp>@o#)(LA-vB_YEq%VCSTFuBn9Vi6kl0dMZ66Nj)P8D@o2`#760d zdS%x7-BK@0_pHE5QxIAM1Z_vdmw(~UKWo|kDc1+uxTTxn!ssv^rTpkiy!KLHaQ6dz zDoxuPc*>9Xt{xLNynoI*msG9?S9R-`{#=AU*KDQgNvhQDQ8@M2@A0xv#7vD zdiYOAWKvRYs;y8EL#g81=Yyf=`|iQ8Euo-VT?(9VmZpwa41Pqbop_j@c^{`^JB3p90f;Mf=1N zUNK_G5Yv_SV#+FubAt5-EU)8BYw)9($#C{;?JiEVgnT&$A9z-|5ir2 zy#@Tl&u^TC-FDf(^2Jd%i0%VdK~qiY!n#myJ`F2B zGi+#wGYCghfCbHd%?#`rzwyaZ{Fjt}fG3}DvD2ZFCtX;3Xh8+_deDp+RDM~{;R+U& zvuH|Uqh2HA1lGiom5|Sc!m}3xTf@J0hmEKu zfl3R9H)@LwMXvlXgUYOdKJce&+{hVrX=}&LH2>-<{4aU^vo7dCK*A3%aeozAY)yTk zpwQU+>QP-*JUiw8yY z_RxSHUYy^4@w4>5h4fon;<++ubqu}C21gRK6bAz{ZCR8JyI^W&sy9q6IhUSmi$MH@c$I!^{mC)cLFD;mhiDPdOHCU&dn7UFv<2XbX1Bv zqNC{8Yw+wB6m&{_brugDbg|ZsHLch>9+7=8vG8=Z&|!DWO3tJ{$5X#Je@A2Pgt zs8ln0anz2Y^)rYJL8TwLxPv0MHReIlZ}**;dHWDhXCFrvknLAs^-HWrv@>Z)NYYzj9|I?rJ(Eg3%&shZ7sZu>b7%(Ud)ta^E}P zKU?mcVz7&66yXEWm?8oZ-^N?iM8hK@EGc>v))Ue4Es@#Zx_M&p-1Ox(i)uT%cpase zwP|Zih7m@4Lu@*=b6#(r>Jc+0D$tq+wC;-7YC;mccXH|Ca~i9{%Wk2c>`;fo_*&0` zfJz(QoSSHb@ax;Gz(9q?<_nwGK3o4wsK;@!j1jlPa`hmry#okcZ|OOjl9BG{EnSeT zuAR*u#pjC7ml_?;NypMkob;t~lRsDe0f|Rs+iOFM!U3&rn};3(Jd<9LUM1x9md=bL z`F<(AbJ&ocibyX>&s#57V?s+zftW_n4(@y^y~rGfBP*+BXz!Kuq4W*}hv5czDm`U# z$~Db^fggnATAL4X6U6Uq#MYL-r1UGglR66ZCq9rg6%otc$?qsr0V&4zo&o7avQnO0P@zV2-S< zn3VeZ-k08%o{W}33jSWAIV(F!U_zgGji&FZ{3+i`PZ$fc;qL{mw`3^M9t8@j(GgI? 
z4pl!g@FBkOitX?$`M)N;C>`THoxt58|F@;511Xf!ed$f<6;eU*!X3%^N79GVhll!~ z62#Y}*BAqsqbenHmDAkK5&6rwXchTRVFPV*{>*rgUL$13me?4I7G#MfUG<$|K|LZ> zYj!^chmYv@n3#bJdwgpRtd#CauSj=6DjIWXHG{}Le1-{mReF^GYz?$Kg(C3i8OH0b z^adB_JaD+4E9rgdBMx&p=md_?U}daXS@TW>H6;Z~SH6Y?aPfr2oY+X}y5G~Sf%VQY zD^dQ6fdtOM+gcnwPJ>H8$Y-QKmVQ@ynNb28n{mCCK9v4O`rqm2?MeTO($}OJ>4CkCi}UiwSvE7B2d)K}76 zdd^6$-Ov;t? zed+&@zBg#Y>>y0trxcjJJIqMS7_beius1*f`pZte?F{G4ZIJX4Ec?#y+_6qkwT~y- z89(Pw>vLym1vKM6EqKTj{`B+NV3 zh8^ioB|V_T8v)cl2+kNZcuaf}CC>CxCtyK2Ow`lcYLFrSKapNTVVeF4JWJ_q>2Ia~ z*6(k+k^fuLH>KZ^?n1-P8Euv`&E#*T{~-NjApbMz%hF#+zs;$#J3@yY(*u6Y%(;CV zetGx~=|?-Zd$b{LvA&lyV(goF!+Moqm^HzDQt}g!%#L;ec?4SaQ-c^iR#WL;8K`YtnB?4-A8y+oAAJrK$9e^j@G?X>s3@{zCc_hDe;@04{B|`C58Sx{yAK zL@eQ!pO*ea`g5$$%(y%)8{lK9mEMs)I84Da!;eJy+v|O;oflROPzY}u%Pr?VHeJn( z9=C6W?<`Udx8Tf`4p1zzxp4=OMEH5>Po@7!`kYitA4{*ZU`OEWiBIZ}VjRnY~dHeZ&C3g&E0l{ZAtNL+MYX&*8~h zEXgJ2HN})Ek6MKaA4#8;{zUpy>1D8-jHcl!kG+&$kRC}Ng#45Ke@^-f>CYIXwd5yJLv^;Bk}MF0)1ZU46?eB|FQI@^bP4xrO)H8VlY648@eYo z(!0`!1NpxueO3B~^f{Ztv?Pg7e)A3K4XN=vmxFgGkYQ_$>m8_a#*CIwkNpPPQ07e- z;F2I{LuYl8TY^w7B8cxW(V?s2b*ENAZH=fsTH?LY{%=TsB)wv9F=H}k#8hWYy`9Im zOr+POZ%Th6eU^ck!5Nss5X?}(&r5$VeVm?mD!nTG5rNk!uvG#0&>*}!((}@DhiB48 zQD|W&fE?laf?BdMn5T{nkiz%7lrL+u(v@L$H8H_vFwnCza5fB8$NC>zHLNi7a+rZl zr8lHMm;Rmf8R^tacR27XzVxp2fPZZzRnl|P7p1RAZ!&WyV*^wKQIdAji_)voiS#sG zJdr*n{h9O^(&s5rbZmn&rdu_rP`db|fdj6oo9wI)qA)!V>oYw>!J?FY&M?B*w;L)k zQC2Pn7Z;Xzd-6V+s7gQm(A}Kb{dV4#eyCE5&++$K^PItpJdjK0okT(&(Zy~;NhrSZonlM(mm)JdC31o>9?h~r2Cla6HAs#V4WAGj3nTr z5eE#_hZs7~tSRX9ve6wj9=KKOW_L;If5}Po`1%8tc7$pY-)fh-M`pcR-ap`+dT=a=i*RGph7*@46*^9gg`~l;!YvT_5-%gX z-$Z)A9`+bugoT$P$S+9uj5jKtD?wNbQ2OYSU?>?+ZAX>z^N6Xv{;}E$}mGRXgG%> zLt8fnKRlE^8l7d7GXeLvr2D)}fE!bmqhnJXk?!o}|CaO(=_>#aE%@h@#3Mxyccl9q zd4KcR34ZvR^m!?hK4$%#udrjMOa{@*P=*_}Kq8JU004jhNklTdm13?VgUO|Z^s@8{2U$Ngf1P710?eO{Vz?O&X)Qgm zcqPYqPo&REza_nBoyf@(UNgXd&YvSJ{LSt;lirYCWZq^1N{$jVB$+mDAT7%HxTQ_j>8+u ze=0pEeM)*sx?>Zyf|N3`RJ1micC#fvBfSZECbv1Ft?^bH5Yc@TNs|9(rO!!c7`mLq zF=dczI2~7P^KlkU%f$^KWd&}&#Smq0Hl(nmaEBUieGMuk 
zv?D-RfWeoRR;`{fNT&cHd!Jnu_1=cBZZUlg5p++GHKVTz+~^jKSy&yeIb6H}`W*4n zQud|p`#3f|-t&}Ra`sJ2VA1l-M@FyfY2KW_99g{e zSb8Kq4=E+Gv2h-K#4r`BxTAWI)INgvpzHVKH{n|xMGIo5D5*4qDJki(&j5XM{w!pz68jb@^ga=^=C(qaO{MGb?Q=`6 zhEjM=dI4;6VW8sN4r|Xi5~7#KmHb9yiN^vgJs!9vLc9ob7TDeykI!5)>ov6I91?F?Dp)@Lwyos1Y;Cws;Q zjx4-p@WI@$gkW;UAdp@iuH>+f=$5MaA_t(a;U6@9ePJc9iRpS}$zL#t>(O403{QVV zG}*F3jWLjqZDv~@R|@X)2?Cp$iE8=krr*t>FaQ$_U27S7XKg1n1l((n9SFlJmpRskDf(w090}XibxLg1~#;KOUii=)2{&yMy5>B7l+u*%_9( z#N!nf2E=T_4N5t5z|pJ~9Zb39F~Qsmg~SYJp&SHUf7^xYdq*!%No<6LkA$1%IUgMtPdJr1;h2~(1&+qfCc=x$MpO($auvP z?jg7io0OivfIBUY&pENVz@Z}}H*Ywx@HJBS%*qf`i&NH?mhC^Kb+dR=-ptUV6h^EL z?!SN-gsy;vhHB`gsBF|KQ6IZ5xc!#S=&-olMez?h4Ek7DMU zSD~k+z1Ny>VF##uZ%{rz+{HNoY~CN3R)MbO*u{Mjd&&@(5O~F&jbuH5u$f_0H$tk& zx?|5xtolp$9u6eC|-7C-74Xa?S}i>(zk@7*}(f z79FMi+;EAF&GZ{?;0@s11Ae2RJ#$HLKPRcn5BFfkLhty-sYPRj$+qzHC+(hd=^g1V z7Jp_#jXM@iWNyu<0}ET(S)>(NS6udp68p6;WBlrdTv+y9sqj_xxt+Z?Pq_Ipf|=k? zat>1368oGUjsXk#+t$T+8T8_N;0yN!d`@p6^CT_ z!yg<) zVKz>9mL7%3rLbd!R6$AN6kEFE`E!WLCqB@|JEcn0E`ZpXS)S}L0FK0`=02sZ{Ep~3 zl_#GJI?;O$U}vNFIvyqgDN*&v?QV^K&9U@AIPZK+_llUVVOjFMm!u^RcvsVF{^a)>)ezLJ7wD5`Ap#>|dKY=8d%x;+5ETNr+bvI(W;M|2{UUiMT$0`p@ z5&bEx*K!JpMPkH|6IV{;fIb#qvgU~5-yTV8bZQT(GhEb^p*Sqp> zta~`4TpB4;c;DD;H>=pvBXq$IX)J&&i9~{Y)=Od|Af1x_%KDB2IgZf#S~~YKWX}Nv z6*)LR!GS7_E?x{q5vXs&FZqfiN*=-KU-n_d{qE}s4cuH;HhR5fg|^S2@fBvJH@K0` zC>##f^lV_u%?mU})>;)QLq;5YZ)BxgK2a}?Qhm7Z+&WUC z*1O+;4Zy^NY_aT@0Xnm1-CzkaAjX}EBtymKfEPhz7u;0yoc%7}!ZC(jyAe z`)$}+)w_lT01QF%zO^&bVMiW!YMb=tkqX+?=Xk$4ng7P<$u%Xd5f!R9%60+`v9Ua0 zN%CIV`t@R%+?G0L!H=gArT;&1??-5 zPdJ%)ix$P3?&IKqKwSzQPrmC9I4U`+wkJ8^D`ptyjvCa&$TvZk7?FQ#T*is*h47WR zIo-9Lp0&3>4Nhl8HZr*FmF@r5pFb!}SJe6{Q~uJj)a${a_NBi-s*ybC*T01VZP^$T zBLm(%fF~R)NT1K)oF>Gc0fF>n7(vyHZ0{JomzkRdI=J!NBOYObx`f@WtfxHi&?UnG z7Cs1RZ6|ugROVHAz699h4&Xdh}%E)Z@R-m#q ztBvH;K}stjNN%MIi-B&u@7g+@ye1z7WdR-@Gohf*0*dHh$$F#O9PK1QJEd0utF;~L`Ic?7_on^na_G0Hn20- z6;H9D{9Cb+1AM#~RNq_$W^DdoJ%j_=63mqR@}4gi*+I)t@F~N?R+h2`^Y7T_BZvOf zHmRaAa}ufigd*iVhYyAHzP1w^=tcKgJ`Om!%i!$pK 
zUM}vi;)!;4x>2Nq!-t=c_@pfI4&PE@E2G3C62!d};;EL^^L;#f8)P;tW^MZ}FV>HjO4j2WxmS7@m@SI(g9Mc_X=WWpH*X2u+xZo?a zq!NPdx<0g?f|ysLDo$Pt{4rd8hNj;^W^Picx4?g*e{-fu6$njASQJ`MPNC~|3V z;#d(l%y4Jkb%x&LfESJe^z!%I*xVTbagJcRKBJ>oK`yL?Xu`t<=pQ{R?PwR7o8g9s zY0VW%wSG0P02PYfW6K!Nu3$6ktC+0(pU6l zxq-!$VV<%>8fB!A=aPtkGE?1}qpdcBC(n>-!>qa^dcNAeC&sfk7LPU0kbijZfG`T2cFFGAkp2y8 zV3}cNJLESv8s_te;N1EM_6&s5o?>g@UyzT&u}w@_dY_hmo^Y!QYY}*UMJl%&%rq&e zX#|!tV4Kwdq}Aap`H0c4jS-Zy9Z%u)r005a+kr@NS;0owkvJu>EiCHa4Y*u6Tlg4l z>rnfZb)&R+tF_zkWYa$+$5>!_XS_wtJw{w_IS@k-Uo$gf6E4~S@<%ke_h4a3Txn(O zgdnP)-WG(3AaaK!=R9;k>Z{T4UuGF*Y?NsJ%IUTYLU)e_8b#DgMkabXdTdT~hxYWs z=P469?z8XWK?+Jv(F|CR?%|H!?JaOpV_kUBdr=UgtOhWC2*0v8I!G03R(D1bdNQzq zIWo)XcwI9Ut+rRFZwo)edo7I*S-=6kCY6~I-w&q&IC2!AZD!Qf^p_Q!f-~n}cKGlK zW%`W#y~ZkCFnzrqRi2FdTo9?Z#&$VkEn1?~r-MV`GmC=*;tnF(6(M%;G4}ybW)a60 zGhACVx?``108z%H{!(&MssGf*(#<~oDEQ@V4+nO zIJfK;aof(+bmaL{i|b?QIpn{vIKD8yc};NJi^Ds}v>jBxYhugN{87AP8xE{8D;<3R zL5xkl?g;#PefaRQ3p`2<7nL0Cxio?6KN|!fp%Otyk~L1$c2L(t>4{ZBZtk2=>F41H zw&yBfTS6`ea*f-wR6`BHej&vkBnTufBO!6 zJ<2?_5wkiTWNY4R_-QD6UrQI5xTy`Us8Gh8@%i?j@M#m=XJP1V0m!SvS+a$Jj-4e@ z8Fq64dcC#Cc$~#i;V8m-%!r+~{BYx~;PLV`R%QTX6wvR(NG|;Oqkxk$VyePc6ok)L z=wmjzd*cz-;Fc}FUE5JY6JG5C^Hc=}>VA6xiY8X5V}YX3Js)%-O1dk(%1@sFwAZZY z47;(p?KmZ&;aAr6vBY&H$=#z4yB;`z&XlA<$f41>`Q|MrOzb^rNp#qer&_pNXsyN5lTC{RWfm<@&a0noT%`=nI#@Uq!9`} z!lKEkTV0@n*R-EhHp^>9D(VH7eeQP5-6H%=X7CB?=VYy$Pr(OgCJ-V(cY6uUBm`){O z;CeLHGQJ}>%4(DhGdydQNCGPBdEf?wL=4%fShs6la*HNLRbbBok^wU~E$B#a+d%|3rvf_V0vWCt6r3aDgAOyW;BiOocrAS(J%!|Z zY?DdyfeMFK1@2vPPZ55X7Jd#nu%(UOn^>=Fa9~T;Vs{7t$0}Y*AJP(i#8)q2cutI9 zQXcAmYiUMiMBCUAOY+llpu#O-nS)d%_Bc4{t?l%o{-8jIm?`OdXG!j zVG~a=JCT02G$N+vJ=t*4p7JH|BvlEHiB+y_u2Y2H*P}Iz+(4FqjXNIXu91%}X>;3u zOMt90+gc#P3`0~~C8dbysbB=qi50!Ne(5q=39b<78G zWW%GEMnH<~Rw01Ec6-{_Aj6WSUs>h5Cb(&>(moPFi+63LbLO(vcyp0vM?dl2hI{%@ z80)`9=_7rJw4&#LTDv&1p~!aEq-o3BhUsS%+tNZkl^PgZIUArfTorEEJTw5YOD-+(l+{sNO3_hYE)sb*;BrX z95;&nQzLT+sW84}sQ<}&pONoocB*P?J-C7h)Ki`am!?(;$OwaSD?RRDa%P9Z3#4I6 
zv3epsuopQ3Y9HCc@t&b=TN^xj%81&a#P*&)$TDF_EB4IJ!oW(pBfTR1%w1oHe@{P} zux$diTUj1BGxsJPj)D;TX@=^AR;;9lWE%GeY|@-ILigZ7oG(bBC){&b0bE<_Y=lw+ z(0#!s*KK70Nfj{IId?A2VntXx^D;y%#ZvkZl17jhk~2E)&)FK`pk3oVokn_!8Qk_O zgCj_E#n8E0dI-}$2oC|Hc9!Gq!_NuUJ6acOJCdNX^GO>k<8-m?TVtHXiv?$7!>O9V z793$Qg-zVTnA|X8HGUy@9rMAr6c*Z){Sr+_aJynfhr2H~(-UsJwqsqRGS?YYJMsyH zC8&3&NPdDuI_nJSMW87zo!LAzfgE%OzO-1Fd=P!qmh5B*qOf8>ZAtV{{v8qg3(Vmy zjOe+r($~c5!@Y;U?tgvYysZH_0)p!m0CFpqN4s>~JgjfVb{>aNT#GWaI52_Iey2JV zL6A^_rxO+6jJji3w(iK7fEi5>Cg6EMfwTzX6qQ2Ll}o;*}iq@Iq=R z7EP?2-=gO|9S8_1M?S&33r5I^)WN_X;c<8T^aN*HQ@(NNGahH=iBLtge?ipOnNN;9 zCO1c)5AK|7p zx6D@Il5oQ*ea=s)1~r`ciqrym59?i6HZx}g&?Q;)_7I3^;N9M#3{%)xhn;C*V^6&| zKn-c`%;2o31Wj=!;nj!ApVIF;(gPxaH4xx}usD#mwVScbk~JAlwKY~@YdfOw0$Su4 zI|jXpI(%(WWs8Sij~dX9t-dwbB(r!uT6}WWwHHLk`Y-Wsb4iG!vqek*vhK@0uI|+g`u@nelJ2tVOy492P-rPHB%IHPa>FrmWS9gGg!f) z4r*jie}*Vd;MqG$$l^6teHgU&47rhjCzL5mOZ6+;3CqDSrqiIs~id|_TexAkGm zjC;x3uc2`Uv2cz8Z3c;468!_|MSdw7HKQJK&Oa4{EUG1N%BbN=kK1%fMzySpPpfDEJlvJ=|wPQKw zhmCAmKQGBa5uY%nacc2##!B@nz#Xd5gQ>Y1Mm zfd%a-bY{b&B|o$DATrp4m(~+fgUgl13eVUMjql%RA4!{Z!I6kjtlYArCWDJZgiGlo z=_5POBjg$p)7DIK?-L-x(t0t2g3}U;2duF7L7jQoE974o8$^~t?nui=Y8W|h5$R)R zXm`XwQDMpkF=UiJGxk{omK|#}!4N%hGHY(A+Lpv|ZKG(z=5<682M|!ltf@!R^A!Fz z7N0~Mc~>e47xywueOi~&E45~_zk@Fxq%l~MyoAs8|k(bJ~M17`RFsk z&r9i?gUBLK?78w9friD(sMpT0P_Yub0f{>5H48$?BZ>}bSF9-RJz?DBNCdQ$9x+0D zVkZM{EQ>tCLvAcw+J{eplINz_5d>$%l#>AfykXAuk@SE@u9{Ty%*;qVsD1kL$0J16 zc5J%6qffJd@6)5B#k)`?GY$U#H1M-xf+c?GqWxGmJ&KZ2VEeI`_Mc>;-EX|O5*=?`QS0PW88r)I0)XOnaUQ`0^dXO0yaA;QIPm$2A6 zGLYQ37g1H&Z=@grbUaA7n-=U~PGRDTX(rpj{t2pOu)r>igt9jJN{>cd#sZ|CrJ9Yt zF*-*OBLi4lA@D9;d?Ecr`lr6MHr&ZNz(>@mH<~9Mll$ z#;V3KNi3cvMA%rl?nZtS%dQH*_#@*cic^LHg4i3-_NizbQi zr_%SNyQBig45klRR*0ro2}$73FQi{eGw2u@;;4+^vovgezj*25cSv8wW)8io%{*)l z3s7QXg<@qmSo})aq-#qjFgyPJY{mDbML|8o@LuPSxYXtIW2xu(&yS@!V zNm~8B^q%y2LjWS;S{cPYWYmTTSJH>lBaS7`V0yHcM@8?;#?#LfQU6kUBE8D0wl?{` zv1mMknPCPJ5q$#s(tqZup9+os+u5ANBeGYM{!34ZG<-XCT)@ z%d^h_*3x^@Po!7rIS>P=Cb-W4psw8vx6+mLQ|a%em!;1WM|60Mg_*>fW8-@H|Iehq 
zlI}=frcFM|;W=%Tkxq}WIAr+I)`pR7aYilBRFdj&0a_B{YpWWjBF@Buy%d3`c8TQx zhA!dWMlVTJqaqi4PHLo=3@O;(|4RCS^q-_-)cA;pTo`!pNO~%L?Bi6D4f&3AS1P36 z!|&DT${f+utipZ|C;7i4{X}}raHh=c{+7nd-64f2^ry8N^MVX0zmDUB0a|(hSDlI=`-VrO60m%fy7ljg}a|wjGP%dxq(Dq9YSy&&8$C@z9_w9 z2NMMUFL0gua2DCM^b_g7NzX}NqDmQEXhDq|BtiSl^8@K`rBmr;l;i{;Sfn*uL-@Cc z`d`CTnG?X4a26+~Ji#d1C&7XL6h`h2C0!bks6gzMBPhX?aSD{IeJcTcI|8^1$B*_k zfUWe7^dF^M`eW%HWU`mdjAc0SM?Q}}UHpOc-=tEyl)i%Fu5BF6ntse*N$)Yd>*nHL zN&f-g`*|#WO>np)g}Rd7k>2-Fit%Y!?uq%q9kwM0nh?&t9W-JHaaS-&ucb!}7@1fp z`$P9ZQF@qCZn&ntkYrn1LA7MC-Uy9#E4?TEwRBIK5sgH>EiL9#={@O*^h@a-9|*CL z-j)7ZI^}TI38uTFrj?BG6X_SyyTo)i7ynB7JL#Tu2Q|*nt%$eN`h6sQa5#!0q%i4c zKqWgaBlawfXWRMpkq8$yh$%DN*%AIdevuJkJkL=;k-Q&s=PBUdtu0_l{vS!-Vkq;Q z77AV&D4N2@pGj}iTYMwJUrPU%F@)|J-#udD3CnTrC`eko|4N>JjLf&Zf68Xk;{Qx~ zmu}^u58K(ar{9~NROKATq(mgPev?ZQO-;&<((Wv2s*V1F@q4Ww; zSwLpIAOiTI^fTAtPP&qQLVET#vm-O$fRwGCOCLx-kiJJuF#MF%Xo61{hN3eas*RPn z{}hB-PYIs_^eyq@!C#tq5YrMJWWd$IV6N??>J>X=br?a5Q2cG_KTDnTmh?hDAkL)= z>0Rj~>F3f9-2kPFeXp({>1WcnrSA+bzLtJ0<u!`*awj-YXH77jzJ<2POPO@ zgBo59PwNHCG;1qs2YYuxY}Jg)e<}T3`j1j6Ju)j06^G=QK9c@M`iVP$jtnm4$Zvtw z9|K`UHF+z29})IYxbH|;($A&OOP{ux<6DbL9!uYrekfgz2B2$-Imvrml1Ht%+dbB` zG6BoY_{#X4B9+du*qJd=d-}hlZknpYr_%GrbM%E6hF(2aoYeS-(iK$r(&nKBesV3n zC;e)G1Sg2^m(p5#7dRpig$6IxNDrl74z^*-9=MjiO|a;6mW)#-5>B{2T!6xkc24P7 zbkWqNl3BlLe4heOQSlv4WVY?JY{TkCVo%4 zkp9l*O4k^`hDrL5q=ytXdW-i1X(@f1O2EckU#bj!B)u3N0Gq_k-oQVaVt-074wk1a>7o5MaN0I}2p4hnip!V!o$aCqjD^C<9gB+OiMv*u* zP++1nr8>;O;thW$T}s~p*C>H7rX=#0(nr!egH=rOPchJsXyA-Z>NS*)rC$yP%O}hH zUFnJRZKknA=o0-)j|s1~gD1~y@InL5l99(=+gW&57OwBDk9Qz6(v?+zqHsS&|94(M z%TSLrd*a9_);qo^i0aW_P(MTelL~(e3RiA2+VONBNbim6{|}`n(mzV4hGOk(|E#5t zq+bjcFX{h}q=(XvINT{fWnr$v@~uXnU)Dx_1`3#AykcFVKce^WX;8qqFx3KQ`N?294_EWdQW<2*Jq~2t+}h_ z@XlA#+tPamP6oT7wKQvlM;uxfXw`yh@)|R8Z6W&3k?ag`vm)4?nJ35rd5;;E(_1<( z)&!L|!`1}DY{gR5w`Bls36|2^mbQ0B9c=;QcZX6))yN-7zqImG2uQ}p9>4gKhJYX< z46I}grE&y--q5YJGD{p>kPTIVfGILZaSDt>ZGerCXiLtXzAm_JdmT=wLgE4)LzypO z?6m}obB-JzcK~9SZ={b5!fA=1GLyo%3ZF}B>6ga0_NrSL^YM!>rJqZ`vR+XSf4vwp 
zUId<7_2&+He$Cy<=}}UOVCUR(V*{#kYtr6-N?y3a?v}`@whNXVE>qkl z|AylN-afER0hKqmNg=trcchQ{?Z%zye~k)`$bZSS>%s^Lo$c|OCAbxSxDRDz0$XXz z>U4JI?2v_3TIdw7?GSKb1^qJ&(aq+Wfhy-b_Jr+pfeE~nUSt+hK6vbrHCP*-HCU+5 z{)}&+h!H#j%1n*BXIQ^y+&Oe$Jbw?aVeEC`bfR&*g9&+Ycqmhj^*%w+i9LFUq<&{n zrUq9=;il%EYg|um&|gGJliN1HKngCx!zK82FCHw=Z?Jg1SADkp6F7b@J(2Fvl;Xg2 zZfx75!-0~y6|{CnolAw4_rWFafXn3m*c<7w9S60r@GE_9@@+5M46$($$cUx`5e5)- zs}w|mbUh~gkY)Sq#oLZurcc~?BE$%|J~Mh`Nlus0$qP4t;fmLUehVY@B)>gn@ybC4 zl^({c2a$DV06qoT=mrN;==P6`@KFBxMEkQd-n1`U{Ac@#7weYIck7)a13RLjqO+7U zvt4jRS%1i5m~sYaX8vJItePyb z3spSOWCDU^%onWEY(2=6n zc=IXD+*C%VThdXM;o&Dhs6|Ae6B;9Tb|&sAKiq)9w}vJDx+46|B<~luzs|4zI*XV6 znzs$LFKh;7tZ2!>pfe_O3JJ9jp$BFu=aJEowUe@(gUbTAR50YaD=(AM-%H{TiOa36v02yKo5+GU~G zmXlEz)+Ura+X4LFB*lN$l{vBFF{KlMc-F)o6Uy4{pcg871cIBfWz9leTabns|Jl3w zjOadC<543lr5E_#N7ChOb>M$_5&pA2zp?&rG0uT|=kQ^L+7(E#a|o!zdgatR*KCT0 zQ1*%RlqU6k8-19rWS{H8Kx_R|k$Lq3g#y0ssU&=K;HGN^q&yD`ru%0`iT`7MT2%jp zz$GUSSYnu_94{!s%{`DN8>6+sQQwn<>qw^K#H@|t;sn6Hz==vr;J67aqM#xKANSwX zC(LkUM=L_{)}}a5nQ5`~8eV7VLkeqRCdnyLXR5vLq%%XyJK;KN`=WTFXhe%4B5AdU zWwd_-pWrud0S__`fKfk-wZ|gzi9y`pfyJptS4J>&T%|P6sLx-6Ak`Xn42}yk2;{x_MMGR+~4_# z@5-%pR5BHxuBvQNQT?XW@G*L%E11+d;jjiO7&jH9l zK4z5J*0kE3NoRCOJ^^`nWL554^{&?_YmXLg=vh(-dO>b#91&RajElv}x%~b2p0)C^ zazXxStfxJS4)QW7MApgAo!?@YAOvpfmy8|;GTi{uX=OC?HzPS0w z#Vr9)1AE>ZdrR~$rg8L&92vmS&K{)VvI0x~6l7uML41I(6YHGYvejY|z#RI^#h{IQ zD1ZqA3Gc$BjP}Ns%G&whYy3%4=6J^jI_yX|6E^&Ga3E>OU?$y_?i0$yVg$m!;?%Dp zhTAcZ?%b#fQ*xn{S0xWxOD9Gq-9r!=txBghPG&<*rm+R(|%QTb1qCz!yeBjXc?8@z(*ICAkR!{B1>UyULz(Xq=>{civtMECnO zH>nKH)ZuY*ZoO)9hotj&sdxpcYipwvZXTf~qMh2vhz0TH72wkxV^$4reyGA{Fk+&1 zTvz~eeVZ1>;@xA2$HwRzEl+wen57je9L8(PcTFr34Eyusa1w#rxhv@oaByp+&k*Wl zKm_b8BWtN)#hz=-=^o>ldk(BHtUso>rLQ}|heRueEQnBPU>35~KXrzh-sA&Lw zEWImzS9EZ8v-`#h0;D1BS{;E+gDvQf71a;AKBiB>usF--KL7pb!G zVjba2$4rq%de11iHxS$ru~&lu4E(KPKg4{5>1{qZMEG0MXZX?xCMME3Lr~t4e#lIa z-sX8t`l9qARFBkkuwe;OZR2g}@1%DJ@}EjCOTQ((DLqdY;9?g%(HYXPZFkHJ{J!)<>G7fQdQtkK z^afe`9A}f%6U)E^D|gFNbYx@0 
zCZ=c?Us^Dgq4qQLO-Gh~9b^DW2CqneApL>#1?deYFx2LU(x8D?;L}~iHzoA%8`57$ z52RdrnP7X#|E2*>&r`J6kJFh-uS$O^{kilKZ7cD(m(s_y8U37GsAof?t{MSG1mr7p zpoWWZ#6cxJHh!e1>ppij;?)&migucP5o}AEk@s`E#@v74rpr+lO344~(l?|xNxAM( zc>9pD(pLH|orAqx?w<5z>5rw?aR3vT^$i8-r_zro>+O4KCen-2A4z{Iy++|`W)+hy zDb@>iGD_Fd(#(({t&OZDFTkEQ!G-`8$` z?n-Y-e=hxj^a71oF}NiKfbU7i094K4;{642Z*7=aiAFCOqVnjl5v?OZ4A?Itj9KG` zCpMfT9|WKi0{j@d7oq3Q{&NPpy`MzT8d4E!zTgDphWo5pwq6Zv?*Izvp7d4e|0R8q zTqsC)H4k=IdRf{^Kb79bq^pqbNMDftSo#{l(gN1-oY``hL=!(`97mHbejxpx^k>rN zk=p|tV*0EYq;bmN>FYr+O965bLXhXgamJXc(QOs&bftgwWMt;w$Tc{P&Cxv z1h@-3G4Y6hi?JOoNl1*w843Rj>Av(;>EBCVmhND87wpXm9SrH0e=PkNOMK%oKP&yA z^bP5Gn+p?=d;)0hN}cqY^d0GG=)fK6i_#xUe<+<%^r%pQ9M~z%#re!&5?5@19Ra}! z`zDHmv;J_5i$Me%T@!24#4G8HDN?0$)UO2W9Y9AYaw!$kU04*Ijc{3l)@+}_0Zk(O zlJrd?&|_l9oL-L_2Uakb=PFeAzVy4&pGv>a0L2*lBMF#($80bT|*L2Jna~L&gO)R(WN9dnkgy4=Z;2lqQTd07*c~dm3VL z2ro#lk@(E0V{Q!&3l!zb9<%o;rRSwTkp4pYqI4G&@Dxxa$#^c^VQ$IK!wQsG|F244 zWgv0Rg5_+U1npf(KlOnQNo{^h`lj@|$T^6WchI6zx-V^|&r07Nj3dkFv5w3zA8P3 zx=-+iTT4xnv>(LCmPme+=*GRitd|GB*Tu1oF zQ|U$NMEY=0#45qP3tN>OyO*H@g&~>kK>96mo0uOiY<$9o5Vw~+R@`>SBZy!prno$@ zuxK(s-bC{IY3b|I@7XZ0HJ(V=&}2sMz$(32#)Kk5Ic zrEf^T#VKwD7VjE>bIw|vGA}9*{eNBhZJbUweE`&f)dv%_!V~1E>WVTC>pyk)0{C#_r7^q~3`INS5VQ>1F9I z=-Z02(g`<6x^>0RNAQziY=*9FEQQM1Au$@}Vo(R#asupv9Y4n(U7>eVe9-X#Jc|@A z;MT^ri_oNkXn06N%!%N?0A}EX?u~VxwRXZlHn=!JpEIMN1oG6d+7-CQe&tW3&q!a! 
z??lDo6f$43$H)B14>zz3eU#x}!DR^YQQ!qV5Qw%63!IhXw8$cL(j{(o6|g>yeLBHaR=3*<5q z(#-GtItWvYftvyfcLXCb-Du>~!Vf1Xt1=2lO=_VwGFfivlKj$$bI-?(o_7P)0E?bFysNmpKyUSSW0LrYS=0v?;d zV%hKB%+8)|u#(Bv?bu1tRh!$it`Q%Q;Vf&?>xId>vxH*b%I*f}!W6x*yRC3qIUC?C zc2BSzMU1_*E}Rl3k1F`q0Eege`8}}x`FkLPr;g}=yb_TRlxJLk&_#jLdQC1WD0XasW3WQ zBwupUTTgg9SW&rgjtVyO1R!K;JoKRfI_>L!%_|%JEn6eN(vc#3hFLGH`W9G5?Ee5yYOh0Q5Qt;&L}mLo8iEoe zo+wzoCX^YcfuCBYHL+jH`%@aWqoWvt`(IP5oAuL&7Ib-n8}6|XBU$lg?iFeqIgME3 zUL)8N%uf8?+>o;j<2N#eX%7Kaxcu5$l?vR)%pS7E$n!&t_cxutmgv~JV2z+O; zYk2zFYJY)&M)`820-14#1si<_WAl{tjN*T5`mk5yW7FH6HRI-FK(eud`V_Yas>)Evq6V@Rh!VN6tpW500bP|l{oJVCds+}&!GVaXJ(CgpCPjapj6L*%1-f0nBA4lXox2Gm@CA(P^x*Cz)X)7G|+C>tl$FX`jIqw7)DM4iq*$KDUz6ZU8`s{+u!f zZDkv=GjrE~JdJ0N@xmuqn+BA=uvwSasCKWI797|aY3hWM(i#&Kpw@?Q!S?!p#5ph- z-#tU%QO&KGtomf^#Nquap4ep6sL4j}!iMjfy8OK^9^NZM_!TkC5<@ynA39w8#60G; z5n5UhwRT|kdSG-7))%c3b1R50F=Ho8JuPDchDY{3p#+{pJ43M9^FcdBTJ#qgQ#fVc zWPN9w?Eb{VSO0XNI9nhLu~(<4$HXv#-Yl91?U5nIu`wDb1SdA`2QfdYy3FiW4%{*rwK5WVL1Sg=IaF1TWD4XsjK68VUesP!Ym! 
z7GymUn=O3{#^Rlkb#7RN(6g+sc|8Z3Q{TEzva+#e?qjg5Ep~Uj6~<$TXeO@=`I60H zk)O%b8Gd_@@x8$+o{)-Xa9U1$5LkwPm;zg^S@YbCVrQZm$pA8v<%(;9HYvp3@oPF9 zr=1z~Hsn0D#4BSC#B^{TW?a0n8-Jee9-8zgB1MR`^YF>RgA&tI+fd!k4#4b4;>JI@;eRtb_#=ZWUNFmNHZWryDtK*L z6oJnUA2u@??K#95*v@@I(E{dDZpfLSofU^ZptCWH4aaB9vE(@bW!pzLI;TyS?bg;hA*_ z?eAQ1*h}(^cgY|FmpKQL&u-hlnF&(n)k9AT>w~BVB$SpW%Ll}29WEue_V(cW$W^E& zsXPXw*kS8Wz$Y@Z0;ONPcK}qbZoI2_VOVCJ#8}Ec4cOMbCy8cHZflbZUxj)G%uc4n&Ub zs2hiPk4Pt2_lfhg#gZ4M20aG6>%a~=#tt*qSY^EFqqnWw@XChF?nr_ff`zm$Rez4= z9*7FI$c{%n#ZaB1PZ{2O-22LubJd65SGe@2M7bgIVFuG6>R@g~%F_}SHJowP%cF8T zKr&~GZLn1hlFtvz&u#`5Hpeux@UtYRd2CGw`wSo>0-mC6LFvnQgs|8hO zP6Cu!pK7;w@~iLWfHV;WP^#@%!3pfUr1V#VDME>DaG$KQV#m!8 zZ_cY`kS#_WSUf`TM z9Mzc~V!&YrHgFJ6sN1ZluLq>O^Y%R1cP@xi~*jg%c zF^Zul82CBLEu~>eH9KxCeIQMw=P^&Y%fBP%4tEhYB&<@PgB#Ca3L@Qe@W&m>x3Jym zdv>ITkt!~Cp*YGm@-d$H5p4LO{$!SfBm)*qgGdFVhSTdrbr-#%wdABvwg}`kwXj_s zqM(uNJ7=gc2arRen6eA555GlLAP%%LGMKDCD0p8%t|)Yc@Rmfv0Xr{%GBW$qd;Tp6 zX2w=4tQ%kpczuK?l|Ov=gbcM=vRk|!6xL-Q#)Mltf``!ItG0~QxNnuTVqiRbWX7=g z8Og&3(qnt>0+#r&m#G0HkufwTK#w(net6J$@|y*iSqeABfgD>{7UlS3C$S$Ikf3mQ z;8bAAgpn8~axFZiYhEm8qHAjbRp1gs>zncLwQ&^#E8bd;ngJn=M-Ux~GG~`WFRs*uhxT*7EHX0GFoO;wK>1}GDzb{8L7jBMm81sLS@U*7(ZdZc=x53Ns~^^ z(1#)Qkfdiqa#&DzEl@flkB;ak{LU>7Ao@4Lr>~h8aK{j#AhI;R1`YOIHiNt|-3Wwv z3!Jp)@?`MC&e(%1+>IA|8*KPU{wuS=4S8me*cUi}fH3AhP%nb8rD0AB1E4qN*)vve zufsKFtwsk=*=vQB@(MJe7=8H2Gs94K1OeAL;H^n?FZ?h-H9`u+)$4VgdE(6=f4)y+ zQ-QAv2xY;;N9klU+RU7R!iWpPb4||tg};1DYIMo0fhfJio)6(q zA=;1QZ zAM&SqWICa%Q4?BFwOi9U5&@N54f8(Z?UA&W?qHa%SiPBa4~{|4j86esme}g6K7mkX zfVQK1I6#I-zFTB9Th9VVfNA6>&f^v7FUs^40CI2rW; z1sHxTK|%R?Ly7&AKE>Kxs)V_YrwTJ(nBVPCl4x?>*D&rg>s?5wGTb*RXh$|Harnte zIPc(J7u>nTFvZA{&XuPls7WdsbS4R32p|mIkdiplh|8OIr z3l-t}F@op}i0lRvk%Cc31S!Yl0;tqwYP^1a|I9S!stO2x0c4qUB@7vxCvPd^50QdqX zO5`s{Th>^ZMDR$y`^r)YiTeK?W|c)PyQK@N<@zP1{SgBcEW;xfw}CiaSiTV))NFv^ z=F+M3yph2s?248 z;Qp-~C~u5QkeLCP*%A&T$im!uW1wD1Wiq&pi^Ii_BqTDZpsp?PM;(QQ&4A4741{2R zIukGNqv}gvyv!_4BmrAXmO7)P&N(q?!~qKvmC}z1e_vbpRar0A)E#g*pqd8EV188? 
z-L?>V7C*b5!15hh?HgRX6;cEms%i~28VExX5b`F#cy?xMty z1Ylj7>)ISP5|;kAnf><;af|!TQIF=Zmq?b22Mw;p=pR`S5#-t)09g?=ZYVJnCbS(9 z)W-1i-U6hJ`h;QJVbU8W8%E$WvA>i91)4$mj%Ye4JQcpNla7q6v0tcEUA!+9VAWTg zhFTHRBsVnT{^N@xMk#5h+5$i>v5*TMb5BH?V3u26J6u9$ULj!6Vfpw9LsuBlvt#7P z8TF-rrba#uIco`u5Ha=Kn9%{o%N01#zMatz3LBr18KmyIc=Mc1|4%^orSv>YomB8_ z)PXsMerBW!p|Bl^?-mv4l_8?inOWywNh^Me=1)ll!Up!j*Nnt3H=#K7RqXKUwg70| zPp}#Qzs_`_wODiiNhap8^s)hgYxaqN5Q0*h4<4bmqDMm_M z5{VXg!VZdUX+cCWP=Jj8PSE8UGK)0H>i|(fuD+1npaso86HG+%(F>k1Ihu29KudXH zNM$~;cIBwIc0BxquWQiuG9u88osibRxyV_Ocj~TO_1d=w2D;T3ut)Ua)i5_5(<0!$I5WwXu0db7d27hif_z} zF}ju>nU?Z7O8B|; zPfd&)+8Fp$jz$o%%2E5hppW1}dO~)Y-IhVbE96GR%0M*>G$eriUIr7TshUG1YUW_f zSlx&t#vNEd%njy#0;^QeF?NKNy=~MH7hPg{Dnd!mzmrBh$HdW|{346rpdQ!k`^Kut zQ~qgln*r@ep=)c4$jnZNeZKWo=&V^J;LVvO{3QTq)R%krZ^#!X_~0OscW~h*=GaE$ zU(&-I{lr^FA+(gDr({+UXYJp-G_zXspF8$dYTsB}lkvzWJ=i_zu9=O3K}sK5gWZS^ zC~yGB?2`*Ar+RY-?>T`5cI3OLkdCCgqzny=(MT@i*9v6WLjf}+l)*!ZP0~silrd+v z=Ho@|ibl3;#*rnpUC|$N(l?F0`!CR+fPflnZ=dsA#UKEwnHlyJ2}j-YlpPry{->;Q z7HYHC!3O)hW*cmv$j0uEII?GhA}%gTWM|xQih4&ut2qqfbE8qNdE8i%3{Bo(%<|D7 zA}4B$$H}D!SjEm>cpQHTddoRN>mb1&BmF34?IY06I@q_ExO9h8;}S)pGaiKICci1> zx-^e-m%(1$q4qUMXhrNIv_HcbN1M)oj_}mpQCEm^e0P<2hmY-p5Zzor_3>| z0cZ4r=!uPkTG%Y8aK@E%kNL3OfQIN=nHhLAv+JX>J?;Qn(u5@daN9QnnQ^p+2C%bN z2n>AFZ-LCvg&kUESs3wj2Ef44+ess>$!8hB4%V1TkgHg4NdY5hy~< z8}$O13J&5FEOKTw=?vF``En`IAS~U!wX-+h@NJ)HzQ5ro;> zh#`dQ6>Ud?8;yOO_7XWoIH2G|d0l^qKluBvtP!m=etd&J*xT?1sRUOpnV38FI&&P$ zIcqT70J$anCFw&+F21ozBhvwl8!2E)FORLUwlfB@Y`V_Mi-5Up=mO|}2Y7I5npBT2 zUKpL|y@>A50NkP22T%u;CQBURmq;qCcd+_wp;E^XS)@S)nR$TMa7s{t4sI z;yPyiNHNfzNOya2Y=KHd9k)gE=jQjHU_*M|FNJ@1jQyGZd222H6Rh%NGyv)>6wC?Y zTf+aGlx2-?8q$a({c~H<^;h?Zz-V8FaO+#4VXZ+u6`|jPb)Ahm& zKm2%brg9E>=hA@W5(qcSD_d?eb^N8J0CG&zeVCvf#NfFNuNsejYHLxS*=a6QD|)O5 z16sSs>nOJQXp>DPd5Qf@=%Ix}c!7Y>zXNzzAcaZdRFH@0rkj4IuKo4BYeFXh^&M zq@N)ZDmQp$!i_cnAsOlLicGg>!-K@0kpiu7ypqljI!jta? 
zKuV~1yuyi0mT%c>F6M zj3B#5aKABA(d(vYY;Z_Kr{Pa0mQ-I$=cA2`QFm(5*_kn7cF>_pSj{~qYOqygQDbgU zNoQrr^00^)`aB^)&VXvp>|CzOIK(3cs72?qelu)gVN{fq);QyRC2PjrV*PW5{LhUE zD-8aS0yVK3K-6P8L{RYa?l$kWgpGd}^*BZmYU4F+Y*<#|n|EPHl?5qgXY(lI`OCw( z$Wc}(%&ph>>coDo(Sq?JeB=>pROJo@)CK`h3{LL`QmB~{aBLQB+NWW+gV;axe`^LW z$HfL6Xky^eW>CZ1JAi@}zoHa#&)5zD+Lp9QWsXHey>y9)qFbpjsn)F9IEoR4{Gk6= z7UHb|cov5X&>^4P9yN+}oz=l|m|VTD2!9_nkcuT)QX-8#cx{PbZvmH>j(ddsPnb5I z6!0E1-$y>q;wUWG6WD0tPb>&`dX~7Lz*(Aq2~D1$eJy^rhk@3H>}PyY#>R-4u3=Z^ zhc%cEw;E0-ech$Ggr=Y1?jz_hpamz$7I@jpCV)&3(*!nsCUf24x+-&N>QnE|rT49% zKB7Y6;;4b8zQhY-aWr;GPtH+Exx}PYWG2ztm=PK74QLb^qWgHpQyhT>o`b878_;-= zs9R0VL`L^-+6Ws`)cq&7Hu@-3Rph}zOAqo}kAhQUlaJBuWThYClBecRHn$0X12tf2 zuXjaeGDQkoSSEJ{HGl;_m8n`RQfq#;a5I`w?s#5$fmkIcN|GTP8_EzPAcQBeIT8s( zGB*^j0M{%BJ~U&IX4vq`j9QC5U;7Z7SiGE770*9st8Ju9M(pJM*!br4_Fr=ZizNqw z*s%vAM9qxRvVVr;Y0t4$xy59K9S<1%)a@V$>&y()1Rf{+WQ$?UFb~5Bv?DYN#5dYu zB9v>e?ISd{mO&`8 z&|h1>P7le8#b{_oox{ct{zie`&fS1U+FhBsmr*cU1X+^ca zMO*4)J6FS<&0<;*CUt=MM^?DnSy;3;pcP9M>J@CENNaW&v*XcL{;0u-$2-PiZViIW zZJNWd88zrgA;OL@C>ijxLmFR(4NAb+C4qmm!YnYIYi#TGFoTg{0fiMlgKgqv_0F>7 zeGOm(eV04|_c{$bAw;@|(S*@G zW}HR_jGFp4F8jFUj{EH~^o@~Am+ZI{6HM)013e$n{+0Bh^p13j>x$ZAM~J(yqdr41~K;3aJc zf;uw;y;5v6D@?GgrS)bKlh(kIG1H{M2J8k;p1|>Uq(_|LAUonW?8z38 zI+)NmkKii2Ff3!j9GD12cY^{_%P57Xr2h#|-_We`l;J2zE{~oOL02qWZ6zH^m(qvQ z9qBbj9X>R??nZwuEPZztP0Kp1_{Lra)mbP1lHAX^^E+gDh31X0u#yS+K>A2}!H9?x zVwg$4h49Eyj2jd3zVuLf4FYJ}KQ$;4ugc>1{o=hZJ(gaz0%FFRMI-IT$Ps(}>s4cN z^FUFo>nwH@%Trs9b@P04`gy|%Pcawg9C&!bRQ<`o{atci=Y%jn2Ts_bi4z0kM<^T@ z(hsG-;dJSarK>>?JBw*|9s#5P?I+Stq)$U2-QebH;;1!a1$MqRopdF=E&WLP6!Vx0 zOlqicX&Y#~h%J((95A>wTp%a*4&QR-W~`-Nn8TB4Y~QZ?&`wW^L0*|)qT=z^e9?{v z30~Kb#+u~+bLp4TOLk~uWQH@GX(8Q(61a08`i}H7>1Bi`YYW(!KJTO_(xWH>CY2zHECLIVfhrJvf&YS0~@$I2&=qewB0WIP0d1;Z$?Cb$#JYG}`myvWSd1Gc zt4>Lx3v$=T9{Q!7_Y>(y(#z7zRtXW3E3poRY5L8l^mpj`9RLwfEsPDm<+(fegPHNE z8fytHX#)+nnp>e}f9KePr8S`k=`nt?vP5CH04wQj>F+sH;tspw3UYA=&-9A15?c=< z-<7^4eM)-K2#i}hMJHwcD{H#nU;K{r_p|~m4AKj@Jh`Ez^aNKoV*O8~m!t;<05%qN 
z?jU(4bm@*08G?e);&)b745>}sYs;g0H}}~GBKA2l;{h2Pu<&M|y?v86`^EviOgsA4%`h6@9bvKb8KIbXWR)taoEA zBS8x5q;u(^k8unAEwNrx*rQLS3!GVQ>D!F60ax(|p(hy+Jfj6M$Rl$?@yme$i2!Xv z)AfXU*p7-*yu}={P&SbNM*4;HpQS?j3LsU60Tdz4ns{PQhe{&+h4fd_M0!hl11cvIP#>SvKBtBe}Wp z8pFPW&KC$k*}upBN)c$T*)s*`d9+tAMmgNc3j9*~Yv~2)_b5hO+WretPRf|}F-*Gn z=hAOaVP9Hex1jn6 zFud-^q&*v0V=*#}NsFT82rvOb=hFEgiwb|gA#jeT&#*WAw=bykd!Xav;~3#GUkx^r5R^EqyHgEkfQ(FLA>ZSj5V`N3M1eB1{+m4Gicv zFnzVL%ThM^vGg73r+#sKq5~tM!^ndrt)!2oCsHllBTJol9M%~3Jva?h!(AgSyc~2f zWp=BnRm=i^E71szlF&nc9!Y;^_-g|ev9;ct6=2a&|38-g-uNC3%eJIEv*a|S??^wC zp1S@&lK!3+uGh%`CPbYz<=YRWpK|b4PlYlI{k9m|IR?DMo~~>-f(8`6*b-5z%&dYO zJ#qm7_X5zIb9BOWH5TsVz?SVZ8X;@xiS&<5gZLs1%(0QTX!OU@ccmW>RQQqfElAg& zCm5?RwJR{POX;W5J4Aao%J)?IE^RVP>C8@!i@NJddQbX}^fSQLz5@uGo?!siq!ZB# z6xHR2jEUKA;EkD!)=J>vwL1DyFU&mMEMUjv7|}fCQCqu$G|-<( z|0w;$7r>t+Ncu;qlb%X1*g(?g=Y2$-=bgdD z>CXQ_`apVJdI1zM$_UBseIWfz`mt3k_YWVz_KvD~;^S4;Xt1DQ(!zPnV;cqTS`u|n zP@CF-kPJV&w_aiQkF01Da7c8A&4^}~&mhu#ApNbhk=~bHher{l!$f2KmGnL7`>ur@ z*ZjS-lYRIoHrS!MuLkHcCLv0b1ub5gkz1~Np7xAOySghM$7=WVTZxFc3;z=fVt3>b-l zk_i8b5a)UJe`i$fjr5N6j{^}V`}4l^U!M3vve*92Jx^XpJWMsC0$ED0`TduTT#ZQ|TY+{Fqwo9DS1KG+hsO z-@DR<^!I3Q5Pl-2xR4$KOb@MpT(HG}v}VjpvkIBnh8q|^w~Qb><_<1Q@M)vU+grRy zGq!|06%fp|^ptA$gg|LN3Zn~i@k{AK`UgwdrZ}bmR-Z^uMn(9(w32?pO(GGNwK-jJ zz+3O)rS!J+Q2Gb3(kNF1ynM}(qPxR~Z>-W+SmF`wBLxi|83oza%P0%uP&Nd67aYl1 zm>H=@=ST!+YzMCe2{hQ0!fu11r9q{8n>SJID-)*sMRffhcJo(#gU0D_*v=PV7s z2>#oWGDc5?tzG<&b`G!l#kgxIEF%zT%Oag}Za@y1bOt~f*=#<5a57@-kEM;($4!lK=Uk{)Y(vfmp*i*gN9E zN0icsEB{O6e`ciVn%!E{Yw>iX5iH)RCUz0HPQkMUz*E|Ek{cpPbo|z!_j8I%;WTni z(=CjfdGid3KlFfa3F=~Ld$Y4D&G-U@CT^sMMi|QM1*5#t9=egp@SYe7;6jelNArny z&g>z*JTjiX$aH#ZF@D2f{pcT&p`UvSR>bu?Hu@GOqtp7stL}sNj9~Yecrp3~a$=&~ zPFf31zE%E-B=wOYRGHmhzmW*z`hom!UA#oebqqPEhraT|Pe@^IfsQc7Cm7=r$z3q$ zvta-Y?EwM|6+wYBp2-G@L{*!2c6J9U=_~#|iO?I(Ls* zEZe?H-(25eu^Q?I6Ejy^3|nbz;+rdXc=^=t5U@>jp;hLwo5N1Te*>S+O2!?IGO?IW zDF!7tWo=>19tf+!<^rdQOgTdS4I5^4*bkeUQSaI?{)G)*X&J~-^QZeaCVyFKnX~gN z-QayA=q-#e{cq&6cK{R8%gTmV 
zWQ>3nZY6@_kz8VK5M65?xV7Vaw$xF2Fko(azq5MhQz@g1q@W$<;;<9%-^QmyY%NUj z94^Hg0AG-~o(`zwEyVW=AnLmq?Nok!2Rg;DnKaybL!)6PJ%`1~`*VGI;nTmFPluss zjCH<}7SaRsx*|%y#6#{gu%P+~6I1b}Y0YX2i~>m7*8pTWe`_v!Yvb`MFzA1SnlJP> zm@iM+_Ah|sH^!;&L7)OZ3aZl*2Fpx3m)?U}qQf5g_>V1CCF@UfSkM-H6jkshGum(nAw)n5LBOZC3gaBkf607mHA`giR;_O>HO<{vVZmUi!#i@&UBe_qFaE{G4x~NC{DB{?5y7=b;QLyw3goN zUi#XZNPQGa%?d{)V1^r5{&#i5|E$gP$rtxnKpRMF6C2c(&d~GOm zx^Fn%Yi7jUgQa?N+lC`Amo$3LjD^&(V%O5gz!aa5f(=n_`*=a>U+rS`Mc`MCuSbS) zX+a*2={}7PMG>D=gK^X{z~KaEd_qUmj-2XMYsXr60VnXgfPG?uKxT6+Wd}@^&vpRW z!TmV1WEfitwMfb8fx9mjN_3 zjW4Q!4Jc04uf)_9iV7ss@mj)%(m&*mAYI%XE2L>H9PYn$8doKb{4WfZ`|7{Js1;TBW$G53qp zGvAZ$qo}!s^0{;uMnpB31{y6mHKr>^_(A0^j2vA*y9VI$j|e2Uzt{aX!x0fyHj~l+ zDfn=oAb(fP=nZXc!1t{yKgI60sA`P?D6mtts1 z!4?Iru@S+Gsv`rdjJcTX`(3P1=y+vDY2S#N^;?c)lathsyMl|fdv*p8wd=C)2IB9} z(*J?{UHGB@^=gl^Y| z5oAW}dTgxO;NywNbPf!9@3vuH>4vA)6Fsv~ytR4!kEM4H`*CyWh`F$5taZ#ZUGV1b z(?&VGcp*I}y~I4dz_o(5aV>oyy?=Od#37+Kx%seTI?^^onT0VO_E~Ijg91-=JUQCg zEz zB3qc@?@H&lE#8X;yU3a_uA#qwJlcsJ#GVG@vP3T@X6{4M$1eXMsZEXT)=5v9$q_yz zB-aa_S|fjTh-%C1(;3z1)F?4dgAM&Zk)B8Y=Z5P>ufetSk@VrH{Ews;r5B9~-?583 zdQOLJ+;Q=-^gw!^E~ds1_nnb+-yiM77XC5CpeNS%USVTu3jlgaP}Gh~rq{*KOY0L( zeX_rz739WvO({3j7X8-NJ`veKYlCubmH)Byy!1diw%lc6SzoHhy?5A-7}H?xlk7xu zP|JfPlI}aB`hP0D!s0~$n#0CvX>@qc^}p8%ys}^~$V3qerkdzC=Ae=>F)~^ zI6QD>2NKkSDtho@qsgPRQcz3cP!idl4f^O075H<~H>B4&A~cfv4MQV$(hsEnApOMH zUN;wiTKWU&E7A+VF*S`j+$`rJo#b zpxD+0mL+}ti9rJ^BWCvI>1}wi*0>DLC@65 zNne(px2D_5OxN4e+tPnS{zKDq4ca!h9TT>tr8Vg(1(=}dRaQd02HWm;#q%N`oE;NT?%ROJ}v#e^cCqo zq|4lR*R}Mn^mo!S>f=}XdE(&wc+eemoG^4~9||0w-vAb;Vdc8r^gsSQ(f z)H?{~F2W33S(?-8SdKO8mqRsQSh#=V0D^HF07XPOJAT*@kDcO@hd@yJx6eyol|C!I zC>7Eb2*!*E;feHLr2j+u`9Orn(&weGNuOil+`=Y0wV;FFmHyw-_c?-cfAQy}Ka;*} zN6R(5%VP+ZKal=U>Bq5o_71>t3fU=0m4z%N0m)=g7HT=psWUSk`BP!JbmaKGLVRZ= z=*m#JDZV8#mF;cgXfh1=*QGy~o~K^htQAabdca=b2mQ~C?(i_(H& z7ZuN(nwfva=)Q-MOQpd3HR)^8pGm*X`9Wd&Iy=+iO!`>*(C26zs!>I-JEJ8xa>k7M z3@+vV^3JJ zb(52PQ~DXxg*T%~SW9*_1BhuEbKKf_S^0nk7mj8}u({z^S6I@@poRU)MfE4Q0H9_N 
z`qb=M0nhPdGy~4C0sl_=L+Mk}b2Nc$`1W%Y>6EEpn^55+>9f)wOMf7}C7m#uE6~9t z(Kn=f%p?0a7B9tJe=dDR`jqr(>C6V2G|cg6rB|f)rC*K8zlQTPL6(9Uu{3f@WuR!! zd(uJ3?N8GGuS;K-K119W6KvKbo{e-W{akt{hABqq^k>q)m)@}cw+VwwuCSc< zq=(W+vGay~0J+T-o*I}J9qgT1n8y8RLH@X~7=MQo3f!@@1>1iDn-KM`C?(HmpWcBY zjYptU=~d}Zq(75B18H<(_8sKoN}nTGJ|>qf%(0|!@nz{$ z`Y1ifRC-1Fy7VVRFkAL*)lY<5={dtFT3@_Bls+T95dXjh6*f4LjMJo`$1eZS{}_Le zMlP(FXCDlR1q$Bd12#4RcW&TWq|ZloCP*{b0_p!Yc<;*0eM|?6?J&Qs6FABL>(V!+ zm&lzeBj2qJBY9K$l=P3%lThIor9YDXQhJ?OXG!N}M>kIn6!3lNThgQSE)(e$>5rv< zPvUoE=R4;>fDz|C7-f;|7|DMRg-@R)oVQbe&~}u_b0wX_*o`nIpnG8eFR-M&0#1zy z$>}``wF?GB4wrGf6kAEJNPi;z&(a&H(-!17pkp3~Wa3Z}Z{l4@K=?&?Q z^@MEgH0L=0>&w!yU;LW%r_x_aU*aU%HN>5%;b(K{1?eT}So%0D$9@AWKvcgGF^O_M!}pZ{xTRHqN8Im;^hN2L(w7(u z8EIE%2g>$V51KOPiovB*o14C)cVQ=;7)ZAdfBs1N5~gcvMkh%)dR(SvA$p(S zG6WpqqZ~t)H9uTCj9FTNJVunA*f`m!V=suwhjsLT;8sMfIT-SWWpB*e^qxP0@GnR& zN>iRKgLGQrXES!vG7NA|`14ii4U|5&v9v9lyub$a=8&X||3La2IMWPUb4?tZ95$H8rq7P8-Q-_$uJF1g_Uu5~c7STlx?>W`I49-z~W;vb6f} z6*5!d&qz73?-Y%ip(BAS9Qo!?x${@0FG_dp_@-$7t63?pAa^40{jBu+(iez&qw{Tr zz$y^MVl;w|GNnvZHH|+Vy?Apgj`xzN@S)cR80DDZtRLRg1J|GPY)gJS0B2_*PI16G zwEjs8Uz5Hpy&>JT6^g3Rl%e$3G{sh7e@>*&p~8g$KDCVlukArj!-uA5>dVrfN?!sn z%8=rY84fLqTn;8$_aGn|8oW0^r_vkJtCakr0ddN3fXBRAd&uECl|I7+ zzZu^e0OAq8qP2;s;}z`2W^NzzBIih6KxlUld5*1lw)Ah6?U)*lN(&>XH%B6~ZP|kM z1W>)C^XY^sDkChc&0}bJ?32SE5JsMtkZLEjNaG4wV%^^4EmYU_(M6ksf zuMJzj(fpdK^8^Vcz#@Tqg`Eq(v?DQw71)Hqe_FbS)6Q`$QPs)$WjUlC7r$ZoTE#wz z-5S#@p7^PViC*?5Sx1gP6EVP`R?>{hKLhYPmChK&u%;w)&lm_R@QGd}G0cNorSz9M z)sL2tvd3cU3}IcNCly3@5kS@C2f2|`dh%b;Jek=>ZY|u)fCew<*W82%PcR{;q!|r1 zt3WTpK=v|_bnz?Fb0|p+%4s_w8W~CQ0D|XW~rRZly(y1sa+$c|DJd zb0k0GwPTlU7+-jVKkf}A&80Kx4e1rgKdp^To&gF3s$1QbM2awGg(WMve`OAQdx(E! 
zHcziY+hdD{oN61SG~@twY!!k3Lio*e)?v^CzSh|#c#cawb%hwzQR2hCZeE1o_!^%RW8Yx6!Gv+DuDwze(S&_$g z1i+=`q(kHBlDm{Xl1?o?EDeGTI6F{ejW~d{aeGQTg!Y;zoS`JEK`5JBTqL8c1`IhP zf{TEl_X0s9Ev(^U!n!B-wFQ!09cBO>+a;rX7f{od2RXuqtqC-yfg(isaf+dhRJS&z z4=23ez?q?}Gh;hVP@trAkvZ0f-YY%@W^js%T%p5rY(R@XMACGlP&@2BX|$VYU_LeSOQ18=;RX`BG*IR!NG_!-_FaJ`onUbHEUXwD&~e$$P^=D) zTq*`EJ%WqP6B{*S6=D=iNyw{%D0jcSTP7f^QTWWn5#Yy;(C5sN_@YnFZg}99L*W|B z$xleMGMR;_jWD;8KE#YhPE;C-JCcEAggZ<7Nd@xgrSyqyxh!7(iv1s$)fNYKWaXWi zbccD#nJ>YdF3N~?F2N5o{7!v{0K}tuH%)DH#ukOEtvhf&h(Q`wt^&iq#7h_aLYVa6 zIPDj|;cXiWRAXNj1OqLGbe{p0gVzw2Y;MO1i})fY1?~JqFt;zk8ZT^aSVg?o8N9pa zaD`=vgg5-4cP11@M;M`jcDzpu&AtOS-4O&7L^c*g8RV98=AL7t zu29ttlcSD}h;eiqizAv%agW88)fys#la=+0LG)7^tdpM)Y&{;Ro4hK*pjvfQ2^uNuF z^D<{KJ9sUHb$88t0U`n#@G~3mtYb@F%0umKu^Tm%**ln<3u3}+sE;SN9Y~T>>V877 zbr;hy=dzYDAs-FS6p4g4;)p3i^gLd}W? zbu9{COa5u+5s(ZRNy&G_o|VOJU7wv*`<TG{;q`9`>tEsk!s2`P zeBYkMB@E+!Q-ak2=5gKs>1Qqu`Vo@TdT^fxu@}T`AR7T<261yMq z!fPvoT={{#vH!OuILFoyGP5HgGd9(DBV=Z*k%~yH0M1VAra3k^0*1Zvgx^T&eNS3S z=cKYFs*&5)Qf&SbwCT*^gDq5q45Tgs`@${9{sl72^9q)J4PqNV*&?UC8akD{!CckY z5D$F4oZZFJw@6Q{xa>stIlg9l&=Jzxz?aff=}Z!Fbkf0@0X`FetP;(;`M)sZwULA- zM#RWCB=5-BK{pq-7M&d#3!~%tWszMs8bP$y zJKI_F%5eAOJmCzw=LRbg<>yN(vm+^d1`rboYxuyag`yi)db|joS+C^|8tJKp!V@-7 z&j&^HvBGr6J*VuLc&_uq-S11#TAn>)GY4#x^x~-Av)`^pY{O8l#NVSjb?THZHlP z^iaBwt&Kcvf}F0v_8lf4*#ZGlRaVtov0HPCq4!dV0Bpx^lw|Wcwt8ZuJ{_V^Dcrq_ zl0~gBvy@?GKv<7JzF&Z~6+1*&8Cp{I?M^iaf`yc8-lPP;>-tBF1%$9KAVh}N@k?8t zer9`Z!=E9H9<(7zN75bqeI#W$Vc`UpZROT~XT1kus51td1~WUiT&|b@H{_4U_Qkc? 
zT1NtRhYqYWAGEb*-l-k#d0|_!8kAxi;*uliC^EEcVilWV3K-8nw+W)M=Dn!TS7s+h z9B?>*n!yLMPU7^h8$&xac#sIWgBB2ActL8LeE35Yq^AO<9RnF)Z2HSNhGm$$`6poT z_~*!uWixzDst257H1Ar4Waz^pjH(b-&W;E- z8kFQ{i`dsN=9H1bPBow)8J0Y!8)u(~$2wgwcy(^6R0f12V(M}EckH6vs3;4T@EEUp zWrxi6;_!%p7k!3Y5FfS%{tgRp4T0t<`!9^zj+Ni^t6KZ7*waA6gRfDhiTU0D`KK6` z{U@V2b!wqtM18^Q2&gT0_ZJve4dOZn5_^$s;+M2zMdq3%4G|Jj!wtBG4X+ z5FhC-P+jmo}vUnxWL_&@YZT3HAVQ-j&5LCq}tkwMl=*b>^a-$#sE3)-PtoY z{9a~J*X7_m#!9*a2d}d*w`0?0tnrpmYI5iUq6V@BsIM(7-oeVt4R0A<-&+1J^M^HD zxJOn!+s80L7tLt6%Zw}#6sj$La|DLYYGd98S%B&9$ww7;gEs0pu)jJ}> z8o#+UU0&J?&x{sxkO8DGOetVyESQ{M+9Bki_w8+XVii0fh3a6DJY_MW{krS73M&ji z&7w#5+SC@XhT%BBZ3E{FP7tL%(a-#+R)Q(aPy4V zH{Et_rxPs=am^3&xA;`J(X_<(Fkq#f4AKDx_98IBb2_mu)u_qJC`@6JhIPn_P)OFj z+|bSwuaggAFW*(@*Qrcj{E!f1&6WibFLyY+79F&>sPR5C7UtHJnCyD zAnP~_r8GyC+08Qw!X><%Ar(Jjg2sk;RLqc1;!KT{h4tlSgNti|lZL*eg6EGeyqQ;E z2M?d(+6$~l1`s$iT%ltx_v+|@zXaNrai1M|(2VPQ0Zaz+IYoX~Y~(3{-Cgc^YXM1? z!H4}QtN}bX@=Rl(*#6?J3ASVXC+vKo*QKrQ{`pUc8!x2>v2D;fC*a_Gbqv%U>Ta%VlfIj&~z z&B)>6w{QR>_6rDTV!-~){zM;1zHwytRz^gM!dZbmJ;s5I@bGIJ zUAVJoHhAzG!F2!t9K=aPK~&foL2wNs()r6ry*fAVTR;p9CT7sthOG!4LHrUbK*f#> z@=#$0pc{0OhpnvfJ5l|Js98p@y*rrZ(AkTud5S6RY=zm-?6wRtIAgVyuQNRMv2FW& zU_c}AU%{4Y@w+n{;u5^F`QSW_9Q-KyL{rr{D5(=Jo>K-Ju|KT=rooY^=&vXbDqKAv z+iPf%c+Jf92FHEfS_=YZq9umiv!mhz$)}uh;;!&Uk+v8{<-XC_RbD zAnl%X@ykRlTMM6Mflpzv?*8JX^oR={SjqfS`jGJp0U%e03e>S~QAZ2vqX=tftW=o9 z9)7olcC{sliH^K${SFVbR33)|iTNSXVA zLh%aM^n4XI&0ztpxieO2uaQSq0c}7rucSL1q8yNX#+Q!NkZO{uE&enRsjcO~A^B|Z z@LSd|;<((Zo;4+{ZomYMgg-iN8-80LA6|-W2NA66e@HG!lT!?J4H#1Zkz+}) zWLhREiKGh4E4NR{hx2~>u9-XUQEnIdV99T0&YZLB+H3C#$5q=rr!|6kiMd`AocA3- z#kUgIpcS&1tmwAaMt$>o{X4=3S>--dr^PECOO^CBGhRCtw8~nr;`SB!*Adi$>0lNd zW?+Y{4_no6E?et_T=h_d291eGBgd0OT6NU*8=csrFdtlkgpRF-6V<@!O)Q}3hS+vT zhIm?+VOVzd)_v_68_rW%8)v-ffc40TJyTe6VT09bj;6qj4siHn<|b&kv5SD4Y9jd;;5#_W$In58M}2ogjX@XR{l2MlONrjz{sf(ST@uxl##Edlvn zi$XRh^nSczI~s$EGLWK+-WTN~dh%j%Si}q*V-CPI&;=PHydu+nMZNk>7PV$yU(xN~ z0fK|7DJ{+--`Y?kU0e&BGyul*dnA!_s>BbMglSpqYc8Kyzj@_xcufA`GY7MBJ zV12!g7{HYK}!)2dY0Iyjf! 
zpp!LJq{3gfkYU3a?u%eKt@J;zd80ed=1r>bGF+{%z)|{0zX&{c(E3k|ppqX_!K1X5 z^9mkHo|$j2iB&uKXDwZl3!e}K9~l-@5rOEYCV+rpGjfl@Xrx=hgJ4zmgV=BsQwujr zD}NR&r0OX^gxsZ#6A#@E7DtI2%?6o>nGuz1LxFOeMOE-F{^HkvF*lvgNVztM{gK(0 z>0viWj%VExt|TQnC2HF;#A7$;y~Os6NCNH}ewo&CZbh!r?A+dZwN^^5&45NoF(%9J ztfFK-;vF(TVew-x8q)vDHQ#>~VdWYMQ}7upyw}{io3dU26lcDXo*SPs5H%-MDCDm- z+1?uYEosgvFnr)g>z?w&{yCF#EZw7vGj%Yn8K^Qh`QNWUoy6L*v(nngCk7-g4jWNJ zr8nd&QTG@zq9ZpfcIN4GrwANLa$`qVS?!{+2&!=dlHsyyW9vlD6T014Cukc6v?RKI z%SuU6HCzzZp8&0ld-;E?zhoHG6$(7(fssfyRIE?KhDH(l4T8-xlCP+G?dZf)j|Gy^ z6cMz*)}r$%*8j??xx*3Y$T(n)1;m$dHqJ;$HkLA8Z{Y-|xuIU$SWYtsKUx}#sUtuU z5k`F`4t&d7RMHFS87*`J5ZoGmV^lAmLxY>3QWG|-V*sNRcbAYSvxF!}DI;rb=`J+x7V8(IU@AhD=Prf8i&$dfCJ1uqQo%+f+kD5Otv!BS z;^Lx+JL0vY4C`y{UL6AlkTf91KBG}r*x-lbj z3Nt}bRz8Ezd*A@Vj0WK(BkgGHJR6QN{KT~Yl~xC7O+*^|_)kU!usqzl9awAZ$<~1F zy2l>9$$;iWN>f6HInl@n=bjlhd%db4kw@Y%>*k*;`rv1%(H_7q45eLK$tA;91(&F` zaJA3k2o$KbgN-06D3wAh3%lxi=QA5*BMxudJ+Rh_WBc%9L<4q?d=?z|B@yf#7;yl@ z4H@1b?gfv{Ge3+o9LkDt}6B3kw&jGXb^><|PNnyaty&e&*-BU7vjM4^+#1=VL|T_jUR z;A~NbV;dBi`_`A#H6#3wGRRq%pFKNtfUZb+|-WIL37CAfTR; zdyK%QqZ7BY{t*G5Lm-iIB-r9z>8H}$H__kyCN9wV0jCjt{&SjhPV5=b!JszWzegA-km%8d|Mp~!0+i@bFRI?~C? zF!NFOdy-otwsy+@9UEA2#JP0OM#D4~fn{do_HREy^&@af5zrNXkYhkE>?HOIu(INP zlEHIE`JGt?koPW*=SLl~fP;2s#I_u{+46DQjty`YNeXTXLMz|>Z;JOsi*x7R=&BYRZa<+gSXzv0Ey9d*4!FLFdPCaVlXKs(neUjB+*lwsS) zpg38`Pifq&!XTv7Gx*%bpBDVbpdKeUZHrQ5hu$kQyrJbG$Na}&S2o1i z8+U_IzRzHnybGe{73}mP4ixMQuq(cPQ7}I~0MiAq#2TYA_m}S!f!8>WDL!!mon&m# zdo%c15Vw149AW~ajxj@Tz>PePSvnzHQH6KXTbQ@Ps4)W(L<`YvCEW$zPYb>z@QiN! 
zh{oIA%SX4?2#a-WNMMqOSJJnKUlgyIeK<1m${6OrlD}LYj^XMlKw|~1Sa7IuK_}$K zk=P8oQFd=Pwa-yS^`Rmx+1K$Z7`p{P^8{a!&f+DW_?XOMz<}0fD`d6TV3m%I@1-7@ zF^Xtwvc?4gsSsB0fx;;lY-liYYR7ee?iRbXrlvr&D(e+^4Fuu&Zh^U(kQPP-b)N(_ zgezNuz-Yscrn~~-oxzJ6x#WgOR7h@Z5KfRDYtp)Y2GH6x6PeQtCT30UvBN5KfV|c; z_lmCaDBv~xUu#nF4v#*qe9kD&a~>W|KrIPjbr?GKmQY#YIXq@b=x~BDJ$4kf1!kzN zOxq%MDX@#Sgu;a4T6snQcCt^e&TOzj!TL|Y7Y4>Lm5^_Ou{t3JUek{|reGN9%KqXs z!RAdkgN=H{T*jef%X+46iuhmZt?Sctt^dh4lCLKZ2M7N0VExZ=}-|Hl!1Q2I8Fa zsx`+krTn%-G{(K~O-A5cU|+_lTlk~Q%JtEg)7k$mrY}e7H#Q+9P^Cx9)6e8$TsEnDaxLhUQXGI=fk5Q8iqFQU@ipjDcICo9$NQkeCcstNKd67 z^Vb17pGiyUTUwJ=z1?f1E9t59o%ALs`4~!P3HrCOVchji*1sjg$%t`-O5K9|xPA^`!w$rBxaG!gDTtE{t&I*Vc0O^l zpGo(ncR)F377Au2canu zKS{rq?nysDVv zdmETx`sd)@!3SP-IWy6YInW0XTlkUmF06-;yw>IoRnn_o4KIaN52UxGQ|Xt|U67Lz z@x<6ZE}asM{r+D{f0TYJ-Ism=;iuwh!3WLld!7UPpLTK#&7mL+6?VVEt3~e8DJInT zr?F8|le>0G_CO7#;b689utlaO<1jC&2P}FgfBoA^Po?*zvGniKDT~_>GQ`4C#Qr5F z{rcjsr1zy0=|7~m5!4K&Jp&R=c{tBKu)4;k zndStPA#?>gFySjV*A5^K;KWegz#>KngJ5_bMFi*-*ePfIcDF1QPAj*L^S%zGE&Ly8 zA-$CTl`lw4y4ZeN*9+-8A7`|czK}kUj->abALDnoAU{z+DZ#6jet1u%UrTRD|0ca7 zm81z%wq}gzU%B@9F9f)?RG_kPI*m0qjCv?wP#7vp+cWzBLS)gSo&(J|lB5$OEB^&( zUtmdwy|3@WZ)n~837PYi%{B|1{+aZMv1`{CKbHO*wYhJAS9F!vc*rNxW50MKJ(B(t z|Gbm#(LNEx|AN}{(|y*37j_S(Ga7E8c@xcG%d%CpP)r~&jtla zM%Fr9^4J_>6um!p2b&nuE_^Ay&k4Ur3#D*Ib68N{NDrkiq$@WePodLwcmvE39lue*Eo{ye~}g(!IH-&llqQ_i(^VzQiy^v@WFI_I4qy@M9prf0E9StBMEiAQ^ulJ#Yt*9P-ETDZiC|L>RK8 zdX~dK`=F;h@vAIHuFT_(u+Y(0c1L;%Qj*0EZ%N-OwCD_gaBD`Uv?l5O@ziEaDr>x$ zn8yx)HXroCNrXQmgSe7@$O>1~ac9!G^hfD;(#O)di||rBGIl0K4NO81%HTNzq1W*Whl(&NDltmK@7 zH@x91&kX_1*!Z;jmgvwP8M2=KG?@7rW$g>;iS%_WAML+K7-v26Ck68Pa+PySQk^!L&gsc1)?Qv)C+m$9b#~G#X(fGOVpH>i znR%<_;1<4+7Sf+h%S$tf!VC{zd`<_-$O^hG*nVb*H$cM;Y$hn*6Lcmaa$88YEef`0 zF{S&>`0(}`*kuc_+_3)WCg|WtfxL+%?1X~k*yOh*8?JBK|73qYm!4WDY;F&%5dQ68 z67+&-eq;>+@s73G=^OX|3)`EZQ#HmZ7?yvb`Sb24)1wby15bQzi0nQ9%>jSL%qbg# z4vj>GVhYBMgW4tobn=seo$u$tVUWm}4^V?x&G=yLDZ^gMWNEp};hPpBJ= zyWI2L;nzeCccB4u>up|k`!4h@j$dX9sGIgb+~Uff9FNUxn}(m<5U++?ouGho_CNhR 
zn!djM*v3PLB@@WS+N9$pp=2L}E+9f&QjK(zxJ11!{Bu^{a|>e^T-b)CutM+Rud}Wq zwYP|H$bV1l{S<9$l7&;q?0h=9e~g1fG+AjbaPCjT{CIB0)v+rk}%8rhY;&Ku=FK$^&7j||dmv0>5awhupobF|~K1;vq; zJ4Qh}=Z>{*{WXU!=6mkn-?W9g_2h25zQy&2tjnlM$2Jh5vJZ@RIw5nTIA+#A){sH9 z)@$EcV2jcxobEYBFY6A7UXYUv^a0$y|2+qw*UJBG_x?W0bNHz2_AA5Ctc;7ZvwKyT z-Y^~eN=QV?4YZ*#-D&{`T~PR3^HKGx_v9s!O>S;HHF^;!{bEoJ*_ z_W$nJCo3A!njQ@?68qqN6?l@s(<&nZ?F;!W zL45`bXag#pbS=m$!A_~ahn)RwoBy{S+h22wsG2)kcWyywX4H(0PeW{pNS{l0upS$8 zn)hI#Bs#;D_W?vkG^6l$LN>A>o6A_yk$uknFW!Q`U`PJ`YV-O#W;X95W4DCSIA?}o zVG_Bw1QE0qHpHSdB3-I;t@!N6o(&%vSVIGGaK5&v&=D(YsN_|zeVu>EF8(7g%WWJ~ z@F&w*+(AI2o>BG6Ac2|QMT&(dozV6Z?Eo5K4Jci`81prU; z_m!)pGOjNG001R)MObuXVRU6WV{&C-bY%cCFflYOFflDMIaDz>Ix{mmG%+nOH99ab zsc!zm0000bbVXQnWMOn=I&E)cX=Zrw%V)2j@T*rqeW_01QC=PRWqnjtBS-< zjh0%m)!qb&y#DXE_rrUg>pAyzo^#!wp65B|xz9lw>T5Dx=ekZuN5`bC1v0w8$N#l| zFJH_6#%1IM{NwsW?+M+d=#Mgg(RLT>YmPdaAUfKA$H$i9)C&)Tua+h30=53vUi5gV z)6sFsYJ;AbxXtKIgw%SO2k?A{c)hDAZWwyDK=m3n$z9;gz<)Wx=F3i zK97rDn4XuUyd8HDdWPjfmWl~7;GB9bK0o>Q5>!m+T4eI$4>*YKr{$7MkNnyG4cKm( z%&j@Ll|z)B@wG0*LkUVNO|WwK5oSMY{|p@k&m94++ z=oca$Jb9_&Z&C3=|9cH3FR4YsLt%eTNhrxbl%=M=E;5eU_oX6LhT_=8k5Ne9ZPxjB zx=?3IBYUQ6W8Td7#i)x@^Cr-@R^yqbiPCNVLqX&B6_U7q#LZ#_gQr_SU}qSUw>#wo0=krNs080m%?wqn2X3?gudse8g@t^$(RorHex zO;weMPNc^}Gt&~3PHm3R4xU_?*u*98DbTTe()(#N953q>Dzr>uoxBUd>SnxIBl<9=G(zbv|{)g5U)vfO86C02xU!Nbc7ga)6r zIj@-<3EZxo|M42X=)Z;Y^rugfdl3vV#e6e!MrPxbjIFBkl&MRZqJ92{2}cq$BWJ%7 z5xl!GWzBeB-b(&^-#%(dmPBRbbiwJ{6y~N`5#19W2iq%$?oxnAbVc}+Rf!6l;0{z! zOi;Cd-5inp^`Do1&xan}Jn@_%!NUmKaftS{Ql_!}tdK03C7i2bVZ`%XhqJ5A6$Si0 znby-r}h@by(UxZ)&Xk0ByPDfdCzi6r%eSRWp+tstlrB{%u!i=u& z9hNv&3P?0Ligfi`)5>xZbKJfsz-zs%iI;V#!CQ?)O(al*?&~GP$jUHa3U{aJqbIc}MbEFjlcP zofoQ8zcs*rhbAVNFE8=Dc2OJj`iOLrt>8Vm_$BsMop;=FHlFxfELpCPRZ!3T_721O z#{{D19uZttbwCt=%l{bhqH9cGS1od70K^)rgKVHd308Vt5UH+r!a0{!!3=*PGC}db36(i|^m- z9j=XT8Ye%6jr|JV9S5u4YKFQ$?@*V4Pn7GP)ec0K0}IN*&T$d? 
zn5tM|X^DGYTh(qDy}HmUwR;$oIQI_KuUGfqleT*FY6gDa-0_cbb5!MPOR_f1mulGY&+UDQ^ItgqjBjXTp}r}TaRObSQjhkv43_Gde(R@U8I zni~0G-**mQ{XFY+WTFb(E%bA0fUlkF4W|AEp5C8!pi&v~SlHe{%Lq%)=4O!eiXn`p zJSY2q`Gz%;*Dv?ESrkqdCv)x_T+>Kz#ENIM3Ng`8<|y_~^MBpFJ?#2vGUP^dy*WiE zjWm~b*awkgSH@Vnfx<;;o;&aRed5Gd!|`MTCD>&QrUYq?lxRUA)K5=3yR6$c3ZsK| z-ZyI&)T&4q$XEY`T@SP!ba4tDVr$DDdG24y!MXP%t zw0p5%Z6Y=68fe)Kf08?_K(*Cx>bD#0atd`ITWgIJ5za5t&spjr)8HgdAbnNPlVEEAX=#G^Y-a(u;y>%V6P^6o-1}UO5l{;XJf4 zSXXuD->G0<3Gr_OaOCr;f}m3Jmy<%&Odjub#Mx{*#T9b#p_7wup;Y3R*(U9MUbLzgNGbjLJnTI-JF_RQ1pEO~X-V9>8@* z!ORC~cbQ`}+^6k!*Za4BwVH=~3|Hj{jm%T15=$5^?x;pU1vR+ZfoM&AlN0u zqo7?bng;*;&yeW0Iv=FIsEXEY0>9w){jxsQ{ zR7w%?wx>pp(g2)`VANN67vWEDExj$%qjOej2d-O*S}N;X4A?q+FmkMykH|*5l;py| zSt2}H!hP{Dkd#QQkL|9@k8h*L*kO5gg>TNt>3FxmYZItfry-0@Nd1W%n9+y?F3nwY z?jKullV`E8G%-LO*{|9UX_DC<)1DtqY;nN)+JBfelr~t%cX%T0L3lfT+R3bsgl<>_ zHmO67^xmHV&% z=HK(4g{P8xZ?9D2YsQ|RNp_@L72S7;z4HKgd4!w1C3OBeZvTgvX5}xneA=2J#OZN& zeEH*0einl_E~&5%Wlf)9wGtW_&|AcJ@MHYNJ)6EysbqIR^&^FfU*UZ_zrqCKmX!cY z)LXekBB#cC^F6;|gV?bh%KJ5U@3*1hyPulBDDggADl8aDvs)IKWN$Qyk+pdYzg8&< zRkj@@elSqX=}9sT9%X#)Kc|xHl}%pRkcI~Zv$hh$)+=p7K_uTsasfd!>al4vP#G|G6uD5>V5Yv zr>OC%-O`u)8n~?-jqZ{;{MdK{hTqqxKwe^8OH^)asrKf+-SXOfwXgGArN$V)5s#7I z+dI>NnLHtlRa`QKCG3BCuXWV2GO|>uR7`{eAKz;rJQ~LdU^Dtm5X+M|X_n)J6nYJhbdw!WD zR9ZkLGrlQ9?T4$es?3fZ0g7DC)5>MnuyFXo<8GPw*b3=$Djif1oWHYcgm&XY`Ztkh z;(1&eov1zw=97~rwkzDQnJ#6kpuaN}v{1FU{qai8=vS4?LDBVQlqO53vxfoZYy4j{ z1Dd2qDW#3_Z=$#LRpnQX3$A#QN}8-`zPz9D>^AOyqDE&}XOxke%hm&8llAL-BN|48 zM59E-5U1Sa+b4Qc3PJ5`WA8NZ29O!QFN@iiA_t{)t^Dn8X6z(ck>$X%YlEsF1K z?v=beR7Zaz17T|}fSfvvD+5FL#9>EE$su61{p{%Wc{*3EX)LV-a-)NSTHLNS*@eS5 z3P*n)GevEtl35|jI~t05DhEx)0EFEyaX}(I7<>i3k8U4?V5J5^ZO7%K!;eQ7{k5|f z&${dt3+;W}?reO$S^{zuDU7J#@B1uKl&o**7;?uxybQ0$ZBEcxD9xW?;&K;3h|2;O5goW#+ zI;#(uzcR2k{?v5NPZ<6g{&rf*n>1VBA*QL$rDwO4TGXAX?X_bBz_8#ZyIku|NI$qV zeG&u5d?y!`qBcE-QUux5f9P`}8hdUzq&e{;1HBu{^FipfZ_dVT}4f;#E~#hq2zs0rhv__o3S<=K2o;o!p=L)G_S#W1aN2;8rS^NIoz9 zII%ntIFF~5rF*?KYFTHMlyW)`Ap0>#VRSA5Pp!hDdm5s8NXv$4}hDDMiv57s-Jkntw 
zG&}V>Y@9M>;9{qmf3n-h0~cfM6}W?{royrGMSRmswZ7+f>Xl`MlGJHOJVh;=V zaK5C@jwWkSUi*cI`Av2-H#K6|t@~f>V@DAZiDz`?)q^PpYpuBLD^*SQA5|*cwE0)o zf6+I@n<~(r9tTi6&WxO=iO4Cfuy7{DgBj^LmM&Vynq^5_^jH!|lJ2XF=ekyk-M}GZ zo6Hi8jTV_^&RsD#woXaj-!1bvYU-_NtOy!>OS7VQm<=j4=fbXcdj9ZpGY|?2QrNZ& zrp6-x=eJ5-S4o7*F<&P@>s*5wx1M^GdD;QCa~-qH`4=Guj2q#4|GKzJ3z8mQkF(z{ zCZA}Dh}foQ$*m%kyDYJFgq+u|?a4+4ZEp6vmu*dlOET>s7`asggWWi6UDroTiSho` zfA=?q>6;$T_G=L(o9iI99lkI2+;ikyb1(^FEB%%mkM0sCzN~KxhbIqXp*7CETTi%F zf$YqQg)V+ZuXL4M_>%^(*E~_prJm(7sX1~>D4g@RU-E=J$y1Y~>l#|3>~S*2fxjVz zQO5d`g)U2vCv>saVaO0ixkA0(H~pwC*%2+Gmy$Hv)XZ!|SRY~qg0Ok8dg$hF*q=fx z+!y4mtZ(671)P?~*PD~fQv#1%b+NROE2;NH75<ulkF*`eco?91?(`;w01c|vR}~J6*j|h*YZuN+n~rVKIXMu>uE*1V$WmGr z=qPV?an{A_mUNm373?>9&E*HEqpbV#;)Zv$2P@CKqF!?SEO@(Y@udE2WLQg_RTQvwNKG3_Hshv5UC)urH?-4UBuj2VTn}-*Y z54kp>l^H_1mOpX534WjlLnWO4;w=G)UaxU3WaE#{`B9^C<575+2{$7lESkY1ej@U- z3uRmZu2<9b8}}_&)m5|BxQ@WOzVGWv^3q#y2D7v3?q!(RAavw;33yfy6^AJAs}LS8 z^pESQ6RrJ6VNnbQHx{myJ{x~M@xdrlKNIBNFZ}GOqOXa(abn^EpE%>u(e!WmL17Tc zapH!k+ho>-I9a!%yiOBW{{nwZv;SIt>PIm&<-eDp7y~cn8(+5m4Ln-;7IVn*&r^=~ z-6(gr;w+%*;dkK`a~s-=&6a5TNRD5xik-@$18I4_KGVc3i>1FYpH!MP?ngrlSS4M= zKGb$UT;P31v0lEutMK}v!Z~y0m*{<=Ttm+FP80%NO5ZlVqafvaDzKojWcp}KIMdLk zw|C&-auB@Io$NK!y8-A`2{!erGKS@smoYA<@4%zvJSg?DA-G*$?8&oyEJac{QK}84 zgQxru_31lN9quz~OwOHn-W#{8B(r%&x6cXN$_NkraSPb??%;!o)O<0ggwe#ggZbpj z+w9z>$Pec;@!~Lb&W8p|m&3n=x;9;z6y+_f+&$ymMaxhZQ3&k*q?=t%FQB<<%k^>Y zo$#N_ z2p~c~vXNd8*@1^QtN_LeW;ah}-;RLquA0Hx_z_wzu-P+U!hsdwU2te*cd`9fRAQjo zv4xkp+ov+qd5hP>XudwXS&VRN9f3bRV+bL&_o#PHG+%cO^jl^XOp-oP9aDrJGvqp) zzw1VPnHtHySCsYuscjxuwFjsmLK{wR7~_f0FQiT|QlYnO#;k;_E^`7-(9qb@iAS)X zTw98uB0cTKeD17qFL_X)hnFgx63+Ri5r$ipvgZ~_l;q0?4Xyj_x6-QJU~`DJ(Vkd8z3(bJZFwNgs+lxVKcPP2E zFyE%Xo`N4`Wqqg3C0De`^aeifVP$=j*>{m^Q!u)6(Cvpsu9MDu^jd z(8-@Pf+*}2Ic!aPy81lXd)A6Q$ZN;Sl(|v=Os^vy5nEYjZV9r7@{+Kf=8C~`9z2b< zXI>p+Mu1j7Y&y6iU;kX&)+&ISyYzffuJaEz{HgKis7}b4Vq*@`vWsy?Hav0p1aA)e z#;y?%>G8#JTs^7gA`tctuiWTNn+0cJ#!7e+_sP|>ekMj{1vE!x3Lb1ieXRJ&EU_RMW##q5C#6{~29~6kk=4sCegNSBrr@W@{woFm5 
z`)sCg@yv{wc2sca-4xf%5wLT=pRT1h%P?qFSmJp<1dRttXjlYLe-Qc_o`wqzd(9(7 z6RM|VN?M;N9x;`&?5^^V7}aGGJ^V?rUD@-y4Dosd2}gTnNi__8qacA@FqfzAl5o-s zL{_l!P-laJ6X5#mk?QGT^&guhsxo^>BQpqJ;sFq+VJ`}$oa3uLj|ptgZnCow8^N!f zR`jQ76*N3cz$uh=*}brK^2B>(h@D(c{M%ZJ_{=pi#2@r1ruS_J?;K)NRcYmmbIUyc z2;vi)WkPObSg)t|3QjxJc~E1;{_t+asD=en;)r+W*MxyUON7-j^qj#4LFJ*ATVs)6 z`LlR)CfAjn%*Orn#+-E}@(XG)uuUOTPsiM40Q@EuMjoB&61V1_Aw-aVl6tR}F9go! z>sRPKswH)o!#E{*NyK)=HJ_#7I_)5Vd;qUFY zuY+bLbpixRm8mc?1LRe(!$?6MNgbtXfMJdZEas0Fl^ic2XWt5cin&4Q5GL5e)lPPm zrIAxCATTn#k|ApInW};Ni}0eJ1^acwy`b2-X9cO|7QBSdeqHe^CfTj>JbJR5(uti_ zAm-u<-_NJl#35Hin6zBFHwv_LqoMbV0=)IJtJM>E>Crr0J(9ypM9)GFfB1q{d}*>i z24{x!1Ms$s9tw{oD+%W%;0&XW1SP&Tz@o$j%>GaeYgM9E(pmi+f%*4;O(vc^^9#Su_XJIJhg|X#bh)FK2bpxB8P5e|mClUtE}RMV